• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3  Intel funded Tungsten Graphics to
4  develop this 3D driver.
5 
6  Permission is hereby granted, free of charge, to any person obtaining
7  a copy of this software and associated documentation files (the
8  "Software"), to deal in the Software without restriction, including
9  without limitation the rights to use, copy, modify, merge, publish,
10  distribute, sublicense, and/or sell copies of the Software, and to
11  permit persons to whom the Software is furnished to do so, subject to
12  the following conditions:
13 
14  The above copyright notice and this permission notice (including the
15  next paragraph) shall be included in all copies or substantial
16  portions of the Software.
17 
18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 
26  **********************************************************************/
27  /*
28   * Authors:
29   *   Keith Whitwell <keithw@vmware.com>
30   */
31 
32 
33 #include "brw_eu_defines.h"
34 #include "brw_eu.h"
35 
36 #include "util/ralloc.h"
37 
38 /**
39  * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40  * registers, implicitly moving the operand to a message register.
41  *
42  * On Sandybridge, this is no longer the case.  This function performs the
43  * explicit move; it should be called before emitting a SEND instruction.
44  */
45 void
gen6_resolve_implied_move(struct brw_codegen * p,struct brw_reg * src,unsigned msg_reg_nr)46 gen6_resolve_implied_move(struct brw_codegen *p,
47 			  struct brw_reg *src,
48 			  unsigned msg_reg_nr)
49 {
50    const struct gen_device_info *devinfo = p->devinfo;
51    if (devinfo->gen < 6)
52       return;
53 
54    if (src->file == BRW_MESSAGE_REGISTER_FILE)
55       return;
56 
57    if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
58       brw_push_insn_state(p);
59       brw_set_default_exec_size(p, BRW_EXECUTE_8);
60       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
61       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
62       brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
63 	      retype(*src, BRW_REGISTER_TYPE_UD));
64       brw_pop_insn_state(p);
65    }
66    *src = brw_message_reg(msg_reg_nr);
67 }
68 
69 static void
gen7_convert_mrf_to_grf(struct brw_codegen * p,struct brw_reg * reg)70 gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
71 {
72    /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
73     * "The send with EOT should use register space R112-R127 for <src>. This is
74     *  to enable loading of a new thread into the same slot while the message
75     *  with EOT for current thread is pending dispatch."
76     *
77     * Since we're pretending to have 16 MRFs anyway, we may as well use the
78     * registers required for messages with EOT.
79     */
80    const struct gen_device_info *devinfo = p->devinfo;
81    if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
82       reg->file = BRW_GENERAL_REGISTER_FILE;
83       reg->nr += GEN7_MRF_HACK_START;
84    }
85 }
86 
/**
 * Encode \p dest as the destination operand of \p inst.
 *
 * Note that \p dest is taken by value: the hstride fixups below modify the
 * local copy only, never the caller's register.
 */
void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Sanity-check the register number against the file's limits. */
   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(dest.nr < 128);

   /* On Gen7+, MRF references are rewritten to the high GRF range. */
   gen7_convert_mrf_to_grf(p, &dest);

   brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
   brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
         /* A destination cannot have stride 0; silently promote to 1. */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         /* Align16: subreg is encoded in 16-byte units. */
         brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
         brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE) {
            assert(dest.writemask != 0);
         }
	 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
	  *    Although Dst.HorzStride is a don't care for Align16, HW needs
	  *    this to be programmed as "01".
	  */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   } else {
      /* Register-indirect addressing. */
      brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

      /* These are different sizes in align1 vs align16:
       */
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                       dest.indirect_offset);
         /* A destination cannot have stride 0; silently promote to 1. */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                        dest.indirect_offset);
	 /* even ignored in da16, still need to set as '01' */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct.  However, when dealing with
    * small registers, it can be useful for us to automatically reduce it to
    * match the register size.
    */
   if (p->automatic_exec_sizes) {
      /*
       * In platforms that support fp64 we can emit instructions with a width
       * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
       * these cases we need to make sure that these instructions have their
       * exec sizes set properly when they are emitted and we can't rely on
       * this code to fix it.
       */
      bool fix_exec_size;
      if (devinfo->gen >= 6)
         fix_exec_size = dest.width < BRW_EXECUTE_4;
      else
         fix_exec_size = dest.width < BRW_EXECUTE_8;

      if (fix_exec_size)
         brw_inst_set_exec_size(devinfo, inst, dest.width);
   }
}
165 
/**
 * Encode \p reg as the first source operand of \p inst, including
 * immediates, regioning, and Align16 swizzles.
 */
void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Sanity-check the register number against the file's limits. */
   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   /* On Gen7+, MRF references are rewritten to the high GRF range. */
   gen7_convert_mrf_to_grf(p, &reg);

   if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
                             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
   brw_inst_set_src0_abs(devinfo, inst, reg.abs);
   brw_inst_set_src0_negate(devinfo, inst, reg.negate);
   brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* Pick the immediate field width from the register type. */
      if (reg.type == BRW_REGISTER_TYPE_DF ||
          brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
         brw_inst_set_imm_df(devinfo, inst, reg.df);
      else if (reg.type == BRW_REGISTER_TYPE_UQ ||
               reg.type == BRW_REGISTER_TYPE_Q)
         brw_inst_set_imm_uq(devinfo, inst, reg.u64);
      else
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);

      /* NOTE(review): for sub-64-bit immediates, src1's file/type fields are
       * made to mirror src0's — presumably a hardware encoding requirement;
       * confirm against the EU ISA spec.
       */
      if (type_sz(reg.type) < 8) {
         brw_inst_set_src1_reg_file(devinfo, inst,
                                    BRW_ARCHITECTURE_REGISTER_FILE);
         brw_inst_set_src1_reg_hw_type(devinfo, inst,
                                       brw_inst_src0_reg_hw_type(devinfo, inst));
      }
   } else {
      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
             brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
	 } else {
            /* Align16: subreg is encoded in 16-byte units. */
            brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
	 }
      } else {
         brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
	 } else {
            brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
	 }
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         /* A scalar SIMD1 source is encoded as the <0;1,0> region. */
	 if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
	 } else {
            brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src0_width(devinfo, inst, reg.width);
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
	 }
      } else {
         brw_inst_set_src0_da16_swiz_x(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src0_da16_swiz_y(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src0_da16_swiz_z(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src0_da16_swiz_w(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

         if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
            /* This is an oddity of the fact we're using the same
             * descriptions for registers in align_16 as align_1:
             */
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                    reg.type == BRW_REGISTER_TYPE_DF &&
                    reg.vstride == BRW_VERTICAL_STRIDE_2) {
            /* From SNB PRM:
             *
             * "For Align16 access mode, only encodings of 0000 and 0011
             *  are allowed. Other codes are reserved."
             *
             * Presumably the DevSNB behavior applies to IVB as well.
             */
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         } else {
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
         }
      }
   }
}
271 
272 
/**
 * Encode \p reg as the second source operand of \p inst.
 *
 * Unlike src0, src1 cannot reference the accumulator or the MRF, and only
 * 32-bit immediates are allowed (two-argument instructions).
 */
void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
    *
    *    "Accumulator registers may be accessed explicitly as src0
    *    operands only."
    */
   assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          reg.nr != BRW_ARF_ACCUMULATOR);

   /* On Gen7+, MRF references are rewritten to the high GRF range; no MRF
    * reference may remain in src1 afterwards.
    */
   gen7_convert_mrf_to_grf(p, &reg);
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
   brw_inst_set_src1_abs(devinfo, inst, reg.abs);
   brw_inst_set_src1_negate(devinfo, inst, reg.negate);

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* two-argument instructions can only use 32-bit immediates */
      assert(type_sz(reg.type) < 8);
      brw_inst_set_imm_ud(devinfo, inst, reg.ud);
   } else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
      } else {
         /* Align16: subreg is encoded in 16-byte units. */
         brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         /* A scalar SIMD1 source is encoded as the <0;1,0> region. */
	 if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
	 } else {
            brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src1_width(devinfo, inst, reg.width);
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
	 }
      } else {
         brw_inst_set_src1_da16_swiz_x(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src1_da16_swiz_y(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src1_da16_swiz_z(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src1_da16_swiz_w(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

         if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
            /* This is an oddity of the fact we're using the same
             * descriptions for registers in align_16 as align_1:
             */
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                    reg.type == BRW_REGISTER_TYPE_DF &&
                    reg.vstride == BRW_VERTICAL_STRIDE_2) {
            /* From SNB PRM:
             *
             * "For Align16 access mode, only encodings of 0000 and 0011
             *  are allowed. Other codes are reserved."
             *
             * Presumably the DevSNB behavior applies to IVB as well.
             */
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         } else {
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
         }
      }
   }
}
361 
362 /**
363  * Set the Message Descriptor and Extended Message Descriptor fields
364  * for SEND messages.
365  *
366  * \note This zeroes out the Function Control bits, so it must be called
367  *       \b before filling out any message-specific data.  Callers can
368  *       choose not to fill in irrelevant bits; they will be zero.
369  */
370 void
brw_set_message_descriptor(struct brw_codegen * p,brw_inst * inst,enum brw_message_target sfid,unsigned msg_length,unsigned response_length,bool header_present,bool end_of_thread)371 brw_set_message_descriptor(struct brw_codegen *p,
372 			   brw_inst *inst,
373 			   enum brw_message_target sfid,
374 			   unsigned msg_length,
375 			   unsigned response_length,
376 			   bool header_present,
377 			   bool end_of_thread)
378 {
379    const struct gen_device_info *devinfo = p->devinfo;
380 
381    brw_set_src1(p, inst, brw_imm_d(0));
382 
383    /* For indirect sends, `inst` will not be the SEND/SENDC instruction
384     * itself; instead, it will be a MOV/OR into the address register.
385     *
386     * In this case, we avoid setting the extended message descriptor bits,
387     * since they go on the later SEND/SENDC instead and if set here would
388     * instead clobber the conditionalmod bits.
389     */
390    unsigned opcode = brw_inst_opcode(devinfo, inst);
391    if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) {
392       brw_inst_set_sfid(devinfo, inst, sfid);
393    }
394 
395    brw_inst_set_mlen(devinfo, inst, msg_length);
396    brw_inst_set_rlen(devinfo, inst, response_length);
397    brw_inst_set_eot(devinfo, inst, end_of_thread);
398 
399    if (devinfo->gen >= 5) {
400       brw_inst_set_header_present(devinfo, inst, header_present);
401    }
402 }
403 
static void brw_set_math_message( struct brw_codegen *p,
				  brw_inst *inst,
				  unsigned function,
				  unsigned integer_type,
				  bool low_precision,
				  unsigned dataType )
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Message length is implied by the function: two-operand functions
    * need two payload registers, everything else one.
    */
   const bool two_operands =
      function == BRW_MATH_FUNCTION_POW ||
      function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
      function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
      function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER;
   const unsigned msg_length = two_operands ? 2 : 1;

   /* Likewise the response length: functions producing two results need
    * two return registers.
    */
   const bool two_results =
      function == BRW_MATH_FUNCTION_SINCOS ||
      function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER;
   const unsigned response_length = two_results ? 2 : 1;

   brw_set_message_descriptor(p, inst, BRW_SFID_MATH,
			      msg_length, response_length, false, false);
   brw_inst_set_math_msg_function(devinfo, inst, function);
   brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
   brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
   /* Saturation is handled by the message, not the instruction: move the
    * saturate bit into the descriptor and clear it on the instruction.
    */
   brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
   brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
   brw_inst_set_saturate(devinfo, inst, 0);
}
449 
450 
static void brw_set_ff_sync_message(struct brw_codegen *p,
				    brw_inst *insn,
				    bool allocate,
				    unsigned response_length,
				    bool end_of_thread)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* FF_SYNC is a URB message with a single (header-only) payload register. */
   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      1, response_length, true, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);

   /* FF_SYNC ignores the remaining URB fields; encode them as zero. */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
}
469 
static void brw_set_urb_message( struct brw_codegen *p,
				 brw_inst *insn,
                                 enum brw_urb_write_flags flags,
				 unsigned msg_length,
				 unsigned response_length,
				 unsigned offset,
				 unsigned swizzle_control )
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Reject flag/generation combinations the hardware doesn't support. */
   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      msg_length, response_length, true,
                              flags & BRW_URB_WRITE_EOT);

   if (flags & BRW_URB_WRITE_OWORD) {
      /* Header plus exactly one OWORD of data. */
      assert(msg_length == 2);
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   /* The "complete" bit exists only before Gen8. */
   if (devinfo->gen < 8)
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));

   /* Allocate/used are pre-Gen7 fields; per-slot offset replaces them on
    * Gen7+.
    */
   if (devinfo->gen < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
         !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}
510 
511 void
brw_set_dp_write_message(struct brw_codegen * p,brw_inst * insn,unsigned binding_table_index,unsigned msg_control,unsigned msg_type,unsigned target_cache,unsigned msg_length,bool header_present,unsigned last_render_target,unsigned response_length,unsigned end_of_thread,unsigned send_commit_msg)512 brw_set_dp_write_message(struct brw_codegen *p,
513 			 brw_inst *insn,
514 			 unsigned binding_table_index,
515 			 unsigned msg_control,
516 			 unsigned msg_type,
517                          unsigned target_cache,
518 			 unsigned msg_length,
519 			 bool header_present,
520 			 unsigned last_render_target,
521 			 unsigned response_length,
522 			 unsigned end_of_thread,
523 			 unsigned send_commit_msg)
524 {
525    const struct gen_device_info *devinfo = p->devinfo;
526    const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
527                           BRW_SFID_DATAPORT_WRITE);
528 
529    brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
530 			      header_present, end_of_thread);
531 
532    brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
533    brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type);
534    brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control);
535    brw_inst_set_rt_last(devinfo, insn, last_render_target);
536    if (devinfo->gen < 7) {
537       brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg);
538    }
539 }
540 
541 void
brw_set_dp_read_message(struct brw_codegen * p,brw_inst * insn,unsigned binding_table_index,unsigned msg_control,unsigned msg_type,unsigned target_cache,unsigned msg_length,bool header_present,unsigned response_length)542 brw_set_dp_read_message(struct brw_codegen *p,
543 			brw_inst *insn,
544 			unsigned binding_table_index,
545 			unsigned msg_control,
546 			unsigned msg_type,
547 			unsigned target_cache,
548 			unsigned msg_length,
549                         bool header_present,
550 			unsigned response_length)
551 {
552    const struct gen_device_info *devinfo = p->devinfo;
553    const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
554                           BRW_SFID_DATAPORT_READ);
555 
556    brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
557 			      header_present, false);
558 
559    brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
560    brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type);
561    brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control);
562    if (devinfo->gen < 6)
563       brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache);
564 }
565 
566 void
brw_set_sampler_message(struct brw_codegen * p,brw_inst * inst,unsigned binding_table_index,unsigned sampler,unsigned msg_type,unsigned response_length,unsigned msg_length,unsigned header_present,unsigned simd_mode,unsigned return_format)567 brw_set_sampler_message(struct brw_codegen *p,
568                         brw_inst *inst,
569                         unsigned binding_table_index,
570                         unsigned sampler,
571                         unsigned msg_type,
572                         unsigned response_length,
573                         unsigned msg_length,
574                         unsigned header_present,
575                         unsigned simd_mode,
576                         unsigned return_format)
577 {
578    const struct gen_device_info *devinfo = p->devinfo;
579 
580    brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length,
581 			      response_length, header_present, false);
582 
583    brw_inst_set_binding_table_index(devinfo, inst, binding_table_index);
584    brw_inst_set_sampler(devinfo, inst, sampler);
585    brw_inst_set_sampler_msg_type(devinfo, inst, msg_type);
586    if (devinfo->gen >= 5) {
587       brw_inst_set_sampler_simd_mode(devinfo, inst, simd_mode);
588    } else if (devinfo->gen == 4 && !devinfo->is_g4x) {
589       brw_inst_set_sampler_return_format(devinfo, inst, return_format);
590    }
591 }
592 
593 static void
gen7_set_dp_scratch_message(struct brw_codegen * p,brw_inst * inst,bool write,bool dword,bool invalidate_after_read,unsigned num_regs,unsigned addr_offset,unsigned mlen,unsigned rlen,bool header_present)594 gen7_set_dp_scratch_message(struct brw_codegen *p,
595                             brw_inst *inst,
596                             bool write,
597                             bool dword,
598                             bool invalidate_after_read,
599                             unsigned num_regs,
600                             unsigned addr_offset,
601                             unsigned mlen,
602                             unsigned rlen,
603                             bool header_present)
604 {
605    const struct gen_device_info *devinfo = p->devinfo;
606    assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
607           (devinfo->gen >= 8 && num_regs == 8));
608    const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
609                                 num_regs - 1);
610 
611    brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE,
612                               mlen, rlen, header_present, false);
613    brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
614    brw_inst_set_scratch_read_write(devinfo, inst, write);
615    brw_inst_set_scratch_type(devinfo, inst, dword);
616    brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
617    brw_inst_set_scratch_block_size(devinfo, inst, block_size);
618    brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
619 }
620 
621 #define next_insn brw_next_insn
622 brw_inst *
brw_next_insn(struct brw_codegen * p,unsigned opcode)623 brw_next_insn(struct brw_codegen *p, unsigned opcode)
624 {
625    const struct gen_device_info *devinfo = p->devinfo;
626    brw_inst *insn;
627 
628    if (p->nr_insn + 1 > p->store_size) {
629       p->store_size <<= 1;
630       p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
631    }
632 
633    p->next_insn_offset += 16;
634    insn = &p->store[p->nr_insn++];
635    memcpy(insn, p->current, sizeof(*insn));
636 
637    brw_inst_set_opcode(devinfo, insn, opcode);
638    return insn;
639 }
640 
641 static brw_inst *
brw_alu1(struct brw_codegen * p,unsigned opcode,struct brw_reg dest,struct brw_reg src)642 brw_alu1(struct brw_codegen *p, unsigned opcode,
643          struct brw_reg dest, struct brw_reg src)
644 {
645    brw_inst *insn = next_insn(p, opcode);
646    brw_set_dest(p, insn, dest);
647    brw_set_src0(p, insn, src);
648    return insn;
649 }
650 
651 static brw_inst *
brw_alu2(struct brw_codegen * p,unsigned opcode,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)652 brw_alu2(struct brw_codegen *p, unsigned opcode,
653          struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
654 {
655    /* 64-bit immediates are only supported on 1-src instructions */
656    assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
657    assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
658 
659    brw_inst *insn = next_insn(p, opcode);
660    brw_set_dest(p, insn, dest);
661    brw_set_src0(p, insn, src0);
662    brw_set_src1(p, insn, src1);
663    return insn;
664 }
665 
666 static int
get_3src_subreg_nr(struct brw_reg reg)667 get_3src_subreg_nr(struct brw_reg reg)
668 {
669    /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
670     * use 32-bit units (components 0..7).  Since they only support F/D/UD
671     * types, this doesn't lose any flexibility, but uses fewer bits.
672     */
673    return reg.subnr / 4;
674 }
675 
676 static enum gen10_align1_3src_vertical_stride
to_3src_align1_vstride(enum brw_vertical_stride vstride)677 to_3src_align1_vstride(enum brw_vertical_stride vstride)
678 {
679    switch (vstride) {
680    case BRW_VERTICAL_STRIDE_0:
681       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
682    case BRW_VERTICAL_STRIDE_2:
683       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
684    case BRW_VERTICAL_STRIDE_4:
685       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
686    case BRW_VERTICAL_STRIDE_8:
687    case BRW_VERTICAL_STRIDE_16:
688       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
689    default:
690       unreachable("invalid vstride");
691    }
692 }
693 
694 
695 static enum gen10_align1_3src_src_horizontal_stride
to_3src_align1_hstride(enum brw_horizontal_stride hstride)696 to_3src_align1_hstride(enum brw_horizontal_stride hstride)
697 {
698    switch (hstride) {
699    case BRW_HORIZONTAL_STRIDE_0:
700       return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
701    case BRW_HORIZONTAL_STRIDE_1:
702       return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
703    case BRW_HORIZONTAL_STRIDE_2:
704       return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
705    case BRW_HORIZONTAL_STRIDE_4:
706       return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
707    default:
708       unreachable("invalid hstride");
709    }
710 }
711 
/**
 * Emit a three-source ALU instruction (MAD, LRP, BFE, BFI2, ...).
 *
 * Two hardware encodings exist and the caller's current access mode
 * selects between them: the align1 encoding (the "a1" field setters,
 * with explicit per-source region strides) and the align16 encoding
 * (the "a16" field setters, with swizzles and replicate control).
 * Returns the emitted instruction so callers can set further fields.
 */
static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   /* 3-src register-number fields are 7 bits wide, and the compacted
    * encodings have no indirect addressing mode.
    */
   assert(dest.nr < 128);
   assert(src0.nr < 128);
   assert(src1.nr < 128);
   assert(src2.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);

   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
      /* Align1 3-src encoding. */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);

      /* The only ARF destination the encoding supports is the accumulator. */
      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                           BRW_ALIGN1_3SRC_ACCUMULATOR);
         brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
      } else {
         brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                           BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
         brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      }
      /* Destination subregister is encoded in 8-byte units. */
      brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);

      brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);

      /* The execution-type bit is derived from the destination type. */
      if (brw_reg_type_is_floating_point(dest.type)) {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
      } else {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
      }

      brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
      brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
      brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
      brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);

      brw_inst_set_3src_a1_src0_vstride(devinfo, inst,
                                        to_3src_align1_vstride(src0.vstride));
      brw_inst_set_3src_a1_src1_vstride(devinfo, inst,
                                        to_3src_align1_vstride(src1.vstride));
      /* no vstride on src2 */

      brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src0.hstride));
      brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src1.hstride));
      brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src2.hstride));

      brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
      brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
      brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
      brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);

      brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
      /* src1 is the only source that can be the accumulator. */
      if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
      } else {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      }
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);

      brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
      brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
      brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
      brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);

      /* src0/src2 may be GRF or immediate; src1 may be GRF or accumulator. */
      assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
             src0.file == BRW_IMMEDIATE_VALUE);
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
             src2.file == BRW_IMMEDIATE_VALUE);

      brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
                                         src0.file == BRW_GENERAL_REGISTER_FILE ?
                                         BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                         BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
      brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
                                         src1.file == BRW_GENERAL_REGISTER_FILE ?
                                         BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                         BRW_ALIGN1_3SRC_ACCUMULATOR);
      brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
                                         src2.file == BRW_GENERAL_REGISTER_FILE ?
                                         BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                         BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
   } else {
      /* Align16 3-src encoding. */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE);
      assert(dest.type == BRW_REGISTER_TYPE_F  ||
             dest.type == BRW_REGISTER_TYPE_DF ||
             dest.type == BRW_REGISTER_TYPE_D  ||
             dest.type == BRW_REGISTER_TYPE_UD);
      if (devinfo->gen == 6) {
         /* Only Gen6 has an MRF-destination bit in this encoding. */
         brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
                                            dest.file == BRW_MESSAGE_REGISTER_FILE);
      }
      brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      /* Destination subregister is encoded in 16-byte units here. */
      brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
      brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);

      assert(src0.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
      brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
      brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
      brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
      brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      /* rep_ctrl replicates a scalar source across all channels. */
      brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
                                          src0.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src1.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
      brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
      brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
      brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
                                          src1.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src2.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
      brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
      brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
      brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
      brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
                                          src2.vstride == BRW_VERTICAL_STRIDE_0);

      if (devinfo->gen >= 7) {
         /* Set both the source and destination types based on dest.type,
          * ignoring the source register types.  The MAD and LRP emitters ensure
          * that all four types are float.  The BFE and BFI2 emitters, however,
          * may send us mixed D and UD types and want us to ignore that and use
          * the destination type.
          */
         brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
         brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
      }
   }

   return inst;
}
867 
868 
869 /***********************************************************************
870  * Convenience routines.
871  */
/* Define a public one-source ALU emitter: brw_<OP>(p, dest, src0). */
#define ALU1(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0)   			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}
879 
/* Define a public two-source ALU emitter: brw_<OP>(p, dest, src0, src1). */
#define ALU2(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}
888 
/* Define a public three-source ALU emitter:
 * brw_<OP>(p, dest, src0, src1, src2).
 */
#define ALU3(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)   			\
{							\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}
898 
/* Like ALU3, but additionally asserts that destination and all three
 * sources share the same floating-point type (all F or all DF).
 */
#define ALU3F(OP)                                               \
brw_inst *brw_##OP(struct brw_codegen *p,         \
                                 struct brw_reg dest,           \
                                 struct brw_reg src0,           \
                                 struct brw_reg src1,           \
                                 struct brw_reg src2)           \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                   \
          dest.type == BRW_REGISTER_TYPE_DF);                   \
   if (dest.type == BRW_REGISTER_TYPE_F) {                      \
      assert(src0.type == BRW_REGISTER_TYPE_F);                 \
      assert(src1.type == BRW_REGISTER_TYPE_F);                 \
      assert(src2.type == BRW_REGISTER_TYPE_F);                 \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {              \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                \
   }                                                            \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
919 
/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 *
 * Note: these emitters return void because the result may be one or two
 * instructions depending on the generation.
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_codegen *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   const struct gen_device_info *devinfo = p->devinfo;					      \
   brw_inst *rnd, *add;							      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
									      \
   if (devinfo->gen < 6) {							      \
      /* turn on round-increments */					      \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);            \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL);          \
   }									      \
}
945 
946 
/* Instantiate the public one-, two- and three-source ALU emitters. */
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3F(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

ROUND(RNDZ)
ROUND(RNDE)
979 
980 brw_inst *
981 brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
982 {
983    const struct gen_device_info *devinfo = p->devinfo;
984 
985    /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
986     * To avoid the problems that causes, we use a <1,2,0> source region to read
987     * each element twice.
988     */
989    if (devinfo->gen == 7 && !devinfo->is_haswell &&
990        brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1 &&
991        dest.type == BRW_REGISTER_TYPE_DF &&
992        (src0.type == BRW_REGISTER_TYPE_F ||
993         src0.type == BRW_REGISTER_TYPE_D ||
994         src0.type == BRW_REGISTER_TYPE_UD) &&
995        !has_scalar_region(src0)) {
996       assert(src0.vstride == BRW_VERTICAL_STRIDE_4 &&
997              src0.width == BRW_WIDTH_4 &&
998              src0.hstride == BRW_HORIZONTAL_STRIDE_1);
999 
1000       src0.vstride = BRW_VERTICAL_STRIDE_1;
1001       src0.width = BRW_WIDTH_2;
1002       src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1003    }
1004 
1005    return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
1006 }
1007 
1008 brw_inst *
brw_ADD(struct brw_codegen * p,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)1009 brw_ADD(struct brw_codegen *p, struct brw_reg dest,
1010         struct brw_reg src0, struct brw_reg src1)
1011 {
1012    /* 6.2.2: add */
1013    if (src0.type == BRW_REGISTER_TYPE_F ||
1014        (src0.file == BRW_IMMEDIATE_VALUE &&
1015 	src0.type == BRW_REGISTER_TYPE_VF)) {
1016       assert(src1.type != BRW_REGISTER_TYPE_UD);
1017       assert(src1.type != BRW_REGISTER_TYPE_D);
1018    }
1019 
1020    if (src1.type == BRW_REGISTER_TYPE_F ||
1021        (src1.file == BRW_IMMEDIATE_VALUE &&
1022 	src1.type == BRW_REGISTER_TYPE_VF)) {
1023       assert(src0.type != BRW_REGISTER_TYPE_UD);
1024       assert(src0.type != BRW_REGISTER_TYPE_D);
1025    }
1026 
1027    return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1028 }
1029 
1030 brw_inst *
brw_AVG(struct brw_codegen * p,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)1031 brw_AVG(struct brw_codegen *p, struct brw_reg dest,
1032         struct brw_reg src0, struct brw_reg src1)
1033 {
1034    assert(dest.type == src0.type);
1035    assert(src0.type == src1.type);
1036    switch (src0.type) {
1037    case BRW_REGISTER_TYPE_B:
1038    case BRW_REGISTER_TYPE_UB:
1039    case BRW_REGISTER_TYPE_W:
1040    case BRW_REGISTER_TYPE_UW:
1041    case BRW_REGISTER_TYPE_D:
1042    case BRW_REGISTER_TYPE_UD:
1043       break;
1044    default:
1045       unreachable("Bad type for brw_AVG");
1046    }
1047 
1048    return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1049 }
1050 
1051 brw_inst *
brw_MUL(struct brw_codegen * p,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)1052 brw_MUL(struct brw_codegen *p, struct brw_reg dest,
1053         struct brw_reg src0, struct brw_reg src1)
1054 {
1055    /* 6.32.38: mul */
1056    if (src0.type == BRW_REGISTER_TYPE_D ||
1057        src0.type == BRW_REGISTER_TYPE_UD ||
1058        src1.type == BRW_REGISTER_TYPE_D ||
1059        src1.type == BRW_REGISTER_TYPE_UD) {
1060       assert(dest.type != BRW_REGISTER_TYPE_F);
1061    }
1062 
1063    if (src0.type == BRW_REGISTER_TYPE_F ||
1064        (src0.file == BRW_IMMEDIATE_VALUE &&
1065 	src0.type == BRW_REGISTER_TYPE_VF)) {
1066       assert(src1.type != BRW_REGISTER_TYPE_UD);
1067       assert(src1.type != BRW_REGISTER_TYPE_D);
1068    }
1069 
1070    if (src1.type == BRW_REGISTER_TYPE_F ||
1071        (src1.file == BRW_IMMEDIATE_VALUE &&
1072 	src1.type == BRW_REGISTER_TYPE_VF)) {
1073       assert(src0.type != BRW_REGISTER_TYPE_UD);
1074       assert(src0.type != BRW_REGISTER_TYPE_D);
1075    }
1076 
1077    assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1078 	  src0.nr != BRW_ARF_ACCUMULATOR);
1079    assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1080 	  src1.nr != BRW_ARF_ACCUMULATOR);
1081 
1082    return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1083 }
1084 
1085 brw_inst *
brw_LINE(struct brw_codegen * p,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)1086 brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1087          struct brw_reg src0, struct brw_reg src1)
1088 {
1089    src0.vstride = BRW_VERTICAL_STRIDE_0;
1090    src0.width = BRW_WIDTH_1;
1091    src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1092    return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1093 }
1094 
1095 brw_inst *
brw_PLN(struct brw_codegen * p,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)1096 brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1097         struct brw_reg src0, struct brw_reg src1)
1098 {
1099    src0.vstride = BRW_VERTICAL_STRIDE_0;
1100    src0.width = BRW_WIDTH_1;
1101    src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1102    src1.vstride = BRW_VERTICAL_STRIDE_8;
1103    src1.width = BRW_WIDTH_8;
1104    src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1105    return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1106 }
1107 
/**
 * Emit a float-to-half conversion into \p dst.
 *
 * On Gen8+ this is an ordinary converting MOV to an HF destination; on
 * Gen7 it is the dedicated F32TO16 instruction.  When the destination is
 * UD and the hardware does not zero the high 16 bits itself, a second
 * predicand-free MOV writes zeros into the odd W channels.  Returns the
 * last instruction emitted.
 */
brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      /* View the UD destination as every-other-W so the HF result lands in
       * the low 16 bits of each dword.
       */
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      /* The zero-fill MOV writes the other half of the same dwords, so
       * suppress the destination-dependency checks between the pair.
       */
      brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}
1154 
1155 brw_inst *
brw_F16TO32(struct brw_codegen * p,struct brw_reg dst,struct brw_reg src)1156 brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
1157 {
1158    const struct gen_device_info *devinfo = p->devinfo;
1159    bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
1160 
1161    if (align16) {
1162       assert(src.type == BRW_REGISTER_TYPE_UD);
1163    } else {
1164       /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1165        *
1166        *   Because this instruction does not have a 16-bit floating-point
1167        *   type, the source data type must be Word (W). The destination type
1168        *   must be F (Float).
1169        */
1170       if (src.type == BRW_REGISTER_TYPE_UD)
1171          src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
1172 
1173       assert(src.type == BRW_REGISTER_TYPE_W ||
1174              src.type == BRW_REGISTER_TYPE_UW ||
1175              src.type == BRW_REGISTER_TYPE_HF);
1176    }
1177 
1178    if (devinfo->gen >= 8) {
1179       return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
1180    } else {
1181       assert(devinfo->gen == 7);
1182       return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
1183    }
1184 }
1185 
1186 
/* Emit a NOP instruction. */
void brw_NOP(struct brw_codegen *p)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
   /* next_insn() copies the current default instruction state; NOP must
    * have an entirely clean encoding, so zero the whole instruction and
    * then re-write just the opcode field.
    */
   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
}
1193 
1194 
1195 
1196 
1197 
1198 /***********************************************************************
1199  * Comparisons, if/else/endif
1200  */
1201 
1202 brw_inst *
brw_JMPI(struct brw_codegen * p,struct brw_reg index,unsigned predicate_control)1203 brw_JMPI(struct brw_codegen *p, struct brw_reg index,
1204          unsigned predicate_control)
1205 {
1206    const struct gen_device_info *devinfo = p->devinfo;
1207    struct brw_reg ip = brw_ip_reg();
1208    brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1209 
1210    brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
1211    brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
1212    brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
1213    brw_inst_set_pred_control(devinfo, inst, predicate_control);
1214 
1215    return inst;
1216 }
1217 
1218 static void
push_if_stack(struct brw_codegen * p,brw_inst * inst)1219 push_if_stack(struct brw_codegen *p, brw_inst *inst)
1220 {
1221    p->if_stack[p->if_stack_depth] = inst - p->store;
1222 
1223    p->if_stack_depth++;
1224    if (p->if_stack_array_size <= p->if_stack_depth) {
1225       p->if_stack_array_size *= 2;
1226       p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1227 			     p->if_stack_array_size);
1228    }
1229 }
1230 
1231 static brw_inst *
pop_if_stack(struct brw_codegen * p)1232 pop_if_stack(struct brw_codegen *p)
1233 {
1234    p->if_stack_depth--;
1235    return &p->store[p->if_stack[p->if_stack_depth]];
1236 }
1237 
1238 static void
push_loop_stack(struct brw_codegen * p,brw_inst * inst)1239 push_loop_stack(struct brw_codegen *p, brw_inst *inst)
1240 {
1241    if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
1242       p->loop_stack_array_size *= 2;
1243       p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1244 			       p->loop_stack_array_size);
1245       p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1246 				     p->loop_stack_array_size);
1247    }
1248 
1249    p->loop_stack[p->loop_stack_depth] = inst - p->store;
1250    p->loop_stack_depth++;
1251    p->if_depth_in_loop[p->loop_stack_depth] = 0;
1252 }
1253 
1254 static brw_inst *
get_inner_do_insn(struct brw_codegen * p)1255 get_inner_do_insn(struct brw_codegen *p)
1256 {
1257    return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1258 }
1259 
1260 /* EU takes the value from the flag register and pushes it onto some
1261  * sort of a stack (presumably merging with any flag value already on
1262  * the stack).  Within an if block, the flags at the top of the stack
1263  * control execution on each channel of the unit, eg. on each of the
1264  * 16 pixel values in our wm programs.
1265  *
1266  * When the matching 'else' instruction is reached (presumably by
1267  * countdown of the instruction count patched in by our ELSE/ENDIF
1268  * functions), the relevant flags are inverted.
1269  *
1270  * When the matching 'endif' instruction is reached, the flags are
1271  * popped off.  If the stack is now empty, normal execution resumes.
1272  */
/**
 * Emit an IF instruction with zeroed jump targets; the real targets are
 * patched in later by patch_IF_ELSE() (or the whole instruction is
 * rewritten by convert_IF_ELSE_to_ADD() in SPF mode).  The IF is pushed
 * on the if-stack so brw_ELSE()/brw_ENDIF() can find it.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction: each generation encodes
    * the (as yet unknown) jump targets differently.
    */
   if (devinfo->gen < 6) {
      /* Pre-gen6: IF operates on IP with a jump count in src1. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      /* Gen6: the jump count lives in the destination field. */
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      /* Gen7: JIP/UIP offsets, with src1 as an immediate placeholder. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      /* Gen8+: JIP/UIP offsets with a single immediate src0. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1316 
1317 /* This function is only used for gen6-style IF instructions with an
1318  * embedded comparison (conditional modifier).  It is not used on gen7.
1319  */
/**
 * Emit a Gen6-style IF with an embedded comparison of \p src0 and
 * \p src1 under \p conditional.  The jump count is zeroed here and
 * patched later by patch_IF_ELSE().
 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
	struct brw_reg src0, struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn,
                          brw_inst_exec_size(devinfo, p->current));
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* The embedded comparison supplies the predicate, so the IF itself
    * must be emitted uncompressed and unpredicated.
    */
   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}
1343 
1344 /**
1345  * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1346  */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      /* Jump offsets are in bytes; each instruction is 16 bytes.  The IF
       * skips to just past the ELSE; the ELSE skips to the virtual ENDIF.
       */
      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}
1384 
1385 /**
1386  * Patch IF and ELSE instructions with appropriate jump targets.
1387  */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   /* Jump distances are in units of 64 bits pre-gen5 and 8 bits on gen5+,
    * so scale the instruction-count deltas accordingly.
    */
   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
	 /* Turn it into an IFF, which means no mask stack operations for
	  * all-false and jumping past the ENDIF.
	  */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
	  * matching ENDIF.
	  */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
	 /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
1475 
/**
 * Emit an ELSE instruction for the current IF block.
 *
 * All jump targets are left as zero here; they are patched to point at the
 * matching ENDIF by patch_IF_ELSE() when brw_ENDIF() is emitted.  The
 * per-generation branches differ only in where the encoding expects the
 * (dummy) operands and jump fields to live.
 */
void
brw_ELSE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (devinfo->gen < 6) {
      /* Pre-gen6: ELSE operates on IP directly. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      /* Gen6: dest is a dummy immediate; the branch target is written with
       * brw_inst_set_gen6_jump_count() (zero for now, patched later).
       */
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      /* Gen7: JIP/UIP are dedicated fields; src1 is an immediate word. */
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      /* Gen8+: only src0 is used for the immediate. */
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control needs an explicit thread switch, unless control
    * flow is being lowered to ADDs (single_program_flow).
    */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Remember the ELSE so brw_ENDIF() can pop and patch it. */
   push_if_stack(p, insn);
}
1513 
/**
 * Close the innermost IF/ELSE block: emit an ENDIF (unless the whole block
 * is being lowered to conditional ADDs), pop the matching IF/ELSE off the
 * if-stack, and patch their branch offsets via patch_IF_ELSE().
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Per-generation operand conventions for ENDIF. */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control needs an explicit thread switch. */
   if (devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->gen < 6) {
      brw_inst_set_gen4_jump_count(devinfo, insn, 0);
      brw_inst_set_gen4_pop_count(devinfo, insn, 1);
   } else if (devinfo->gen == 6) {
      brw_inst_set_gen6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   /* Fix up the IF (and ELSE, if any) to point at this ENDIF. */
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1594 
/**
 * Emit a BREAK out of the innermost loop.
 *
 * The jump offset is left as zero here; on pre-gen6 it is filled in later
 * by brw_patch_break_cont() when the matching WHILE is emitted.
 */
brw_inst *
brw_BREAK(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (devinfo->gen >= 8) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      /* Pop the mask-stack entry of every IF nested inside the current
       * loop, since BREAK jumps past all of their ENDIFs.
       */
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn,
                          brw_inst_exec_size(devinfo, p->current));

   return insn;
}
1622 
/**
 * Emit a CONTINUE for the innermost loop.
 *
 * Like brw_BREAK(), the jump offset starts at zero; on pre-gen6 it is
 * patched later by brw_patch_break_cont(), and the pop count unwinds the
 * mask stack for every IF nested inside the loop.
 */
brw_inst *
brw_CONT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   if (devinfo->gen >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   }

   if (devinfo->gen < 6) {
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn,
                          brw_inst_exec_size(devinfo, p->current));
   return insn;
}
1647 
/**
 * Emit a HALT instruction (Gen6+ encoding; the name notwithstanding, the
 * Gen8+ form is handled here too).  The jump targets (UIP/JIP) are left as
 * zero and are updated later by the caller.
 */
brw_inst *
gen6_HALT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   if (devinfo->gen >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn,
                          brw_inst_exec_size(devinfo, p->current));
   return insn;
}
1668 
1669 /* DO/WHILE loop:
1670  *
1671  * The DO/WHILE is just an unterminated loop -- break or continue are
1672  * used for control within the loop.  We have a few ways they can be
1673  * done.
1674  *
1675  * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1676  * jip and no DO instruction.
1677  *
1678  * For non-uniform control flow pre-gen6, there's a DO instruction to
1679  * push the mask, and a WHILE to jump back, and BREAK to get out and
1680  * pop the mask.
1681  *
1682  * For gen6, there's no more mask stack, so no need for DO.  WHILE
1683  * just points back to the first instruction of the loop.
1684  */
brw_inst *
brw_DO(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (devinfo->gen >= 6 || p->single_program_flow) {
      /* No actual DO instruction is needed (see the block comment above);
       * just record where the loop body starts so the matching WHILE (or
       * lowered ADD) can branch back to it.
       */
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(devinfo, insn, execute_size);
      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);

      return insn;
   }
}
1711 
/**
 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
 * instruction here.
 *
 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
 * nesting, since it can always just point to the end of the block/current loop.
 */
static void
brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *do_inst = get_inner_do_insn(p);
   brw_inst *inst;
   unsigned br = brw_jump_scale(devinfo);

   assert(devinfo->gen < 6);

   /* Walk backwards over the loop body: everything strictly between the DO
    * and the WHILE we just emitted.
    */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
          brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* BREAK jumps just past the WHILE, out of the loop (+1). */
         brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
      } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
                 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* CONTINUE jumps to the WHILE itself (next iteration). */
         brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
      }
   }
}
1743 
/**
 * Emit the WHILE that closes the innermost DO/WHILE loop and pop the loop
 * stack.
 *
 * Gen6+ WHILE branches straight back to the top of the loop body.  On
 * Gen4/5 we either lower to an ADD on IP (single-program-flow case) or emit
 * a real WHILE and then patch any BREAK/CONT inside the loop body.
 */
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      if (devinfo->gen >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, brw_imm_d(0));
         /* Backward branch: (do_insn - insn) is negative. */
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->gen == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn,
                             brw_inst_exec_size(devinfo, p->current));

   } else {
      if (p->single_program_flow) {
         /* Lowered form: plain ADD on IP jumping back to the loop top. */
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         /* Jump back just past the DO, i.e. to the first body instruction. */
         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gen4_pop_count(devinfo, insn, 0);

         brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
1806 
1807 /* FORWARD JUMPS:
1808  */
brw_land_fwd_jump(struct brw_codegen * p,int jmp_insn_idx)1809 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1810 {
1811    const struct gen_device_info *devinfo = p->devinfo;
1812    brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1813    unsigned jmpi = 1;
1814 
1815    if (devinfo->gen >= 5)
1816       jmpi = 2;
1817 
1818    assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1819    assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1820 
1821    brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1822                                 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1823 }
1824 
1825 /* To integrate with the above, it makes sense that the comparison
1826  * instruction should populate the flag register.  It might be simpler
1827  * just to use the flag reg for most WM tasks?
1828  */
brw_CMP(struct brw_codegen * p,struct brw_reg dest,unsigned conditional,struct brw_reg src0,struct brw_reg src1)1829 void brw_CMP(struct brw_codegen *p,
1830 	     struct brw_reg dest,
1831 	     unsigned conditional,
1832 	     struct brw_reg src0,
1833 	     struct brw_reg src1)
1834 {
1835    const struct gen_device_info *devinfo = p->devinfo;
1836    brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1837 
1838    brw_inst_set_cond_modifier(devinfo, insn, conditional);
1839    brw_set_dest(p, insn, dest);
1840    brw_set_src0(p, insn, src0);
1841    brw_set_src1(p, insn, src1);
1842 
1843    /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1844     * page says:
1845     *    "Any CMP instruction with a null destination must use a {switch}."
1846     *
1847     * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1848     * mentioned on their work-arounds pages.
1849     */
1850    if (devinfo->gen == 7) {
1851       if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1852           dest.nr == BRW_ARF_NULL) {
1853          brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1854       }
1855    }
1856 }
1857 
1858 /***********************************************************************
1859  * Helpers for the various SEND message types:
1860  */
1861 
1862 /** Extended math function, float[8].
1863  */
gen4_math(struct brw_codegen * p,struct brw_reg dest,unsigned function,unsigned msg_reg_nr,struct brw_reg src,unsigned precision)1864 void gen4_math(struct brw_codegen *p,
1865 	       struct brw_reg dest,
1866 	       unsigned function,
1867 	       unsigned msg_reg_nr,
1868 	       struct brw_reg src,
1869 	       unsigned precision )
1870 {
1871    const struct gen_device_info *devinfo = p->devinfo;
1872    brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1873    unsigned data_type;
1874    if (has_scalar_region(src)) {
1875       data_type = BRW_MATH_DATA_SCALAR;
1876    } else {
1877       data_type = BRW_MATH_DATA_VECTOR;
1878    }
1879 
1880    assert(devinfo->gen < 6);
1881 
1882    /* Example code doesn't set predicate_control for send
1883     * instructions.
1884     */
1885    brw_inst_set_pred_control(devinfo, insn, 0);
1886    brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1887 
1888    brw_set_dest(p, insn, dest);
1889    brw_set_src0(p, insn, src);
1890    brw_set_math_message(p,
1891                         insn,
1892                         function,
1893                         src.type == BRW_REGISTER_TYPE_D,
1894                         precision,
1895                         data_type);
1896 }
1897 
/**
 * Extended math on Gen6+: a native MATH instruction instead of a SEND to
 * the shared math unit (cf. gen4_math).
 *
 * The asserts spell out the per-generation operand restrictions: integer
 * source types for the INT_DIV functions (float otherwise), packed
 * (hstride 1) operands on Gen6, and no source modifiers on Gen6.
 */
void gen6_math(struct brw_codegen *p,
	       struct brw_reg dest,
	       unsigned function,
	       struct brw_reg src0,
	       struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(devinfo->gen >= 6);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (devinfo->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
      /* Gen8+ additionally allows an immediate divisor. */
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (devinfo->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   brw_inst_set_math_function(devinfo, insn, function);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
1944 
1945 /**
1946  * Return the right surface index to access the thread scratch space using
1947  * stateless dataport messages.
1948  */
1949 unsigned
brw_scratch_surface_idx(const struct brw_codegen * p)1950 brw_scratch_surface_idx(const struct brw_codegen *p)
1951 {
1952    /* The scratch space is thread-local so IA coherency is unnecessary. */
1953    if (p->devinfo->gen >= 8)
1954       return GEN8_BTI_STATELESS_NON_COHERENT;
1955    else
1956       return BRW_BTI_STATELESS;
1957 }
1958 
/**
 * Write a block of OWORDs (half a GRF each) to the scratch buffer,
 * using a constant offset per channel.
 *
 * The offset must be aligned to oword size (16 bytes).  Used for
 * register spilling.
 */
void brw_oword_block_write_scratch(struct brw_codegen *p,
				   struct brw_reg mrf,
				   int num_regs,
				   unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
   uint32_t msg_type;

   /* On gen6+ the message offset is in oword (16-byte) units. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* One header register plus the payload registers. */
   const unsigned mlen = 1 + num_regs;

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
					 BRW_REGISTER_TYPE_UW);

      brw_inst_set_compression(devinfo, insn, false);

      if (brw_inst_exec_size(devinfo, insn) >= 16)
	 src_header = vec16(src_header);

      assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
      if (devinfo->gen < 6)
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (devinfo->gen >= 6) {
	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
	 send_commit_msg = 0;
      } else {
	 dest = src_header;
	 send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (devinfo->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
      }

      if (devinfo->gen >= 6)
	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
			       insn,
                               brw_scratch_surface_idx(p),
			       BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
			       msg_type,
                               target_cache,
			       mlen,
			       true, /* header_present */
			       0, /* not a render target */
			       send_commit_msg, /* response_length */
			       0, /* eot */
			       send_commit_msg);
   }
}
2069 
2070 
/**
 * Read a block of owords (half a GRF each) from the scratch buffer
 * using a constant index per channel.
 *
 * Offset must be aligned to oword size (16 bytes).  Used for register
 * spilling.
 */
void
brw_oword_block_read_scratch(struct brw_codegen *p,
			     struct brw_reg dest,
			     struct brw_reg mrf,
			     int num_regs,
			     unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* On gen6+ the message offset is in oword (16-byte) units. */
   if (devinfo->gen >= 6)
      offset /= 16;

   if (p->devinfo->gen >= 7) {
      /* On gen 7 and above, we no longer have message registers and we can
       * send from any register we want.  By using the destination register
       * for the message, we guarantee that the implied message write won't
       * accidentally overwrite anything.  This has been a problem because
       * the MRF registers and source for the final FB write are both fixed
       * and may overlap.
       */
      mrf = retype(dest, BRW_REGISTER_TYPE_UD);
   } else {
      mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   }
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   const unsigned rlen = num_regs;
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);

   /* Build the message header: a copy of g0 with the scratch offset in
    * element 2 (same scheme as brw_oword_block_write_scratch).
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

      assert(brw_inst_pred_control(devinfo, insn) == 0);
      brw_inst_set_compression(devinfo, insn, false);

      brw_set_dest(p, insn, dest);	/* UW? */
      if (devinfo->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
      }

      brw_set_dp_read_message(p,
			      insn,
                              brw_scratch_surface_idx(p),
			      BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
			      target_cache,
			      1, /* msg_length */
                              true, /* header_present */
			      rlen);
   }
}
2150 
/**
 * Gen7+ scratch block read using the dedicated scratch dataport message:
 * the offset is encoded in the message descriptor, so the only header
 * needed is a plain copy of g0 (for the g0.5 scratch pointer).
 */
void
gen7_block_read_scratch(struct brw_codegen *p,
                        struct brw_reg dest,
                        int num_regs,
                        unsigned offset)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);

   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch offset.
    */
   brw_set_src0(p, insn, brw_vec8_grf(0, 0));

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));

   gen7_set_dp_scratch_message(p, insn,
                               false, /* scratch read */
                               false, /* OWords */
                               false, /* invalidate after read */
                               num_regs,
                               offset,
                               1,        /* mlen: just g0 */
                               num_regs, /* rlen */
                               true);    /* header present */
}
2184 
/**
 * Read float[4] vectors from the data port constant cache.
 * Location (in buffer) should be a multiple of 16.
 * Used for fetching shader constants.
 */
void brw_oword_block_read(struct brw_codegen *p,
			  struct brw_reg dest,
			  struct brw_reg mrf,
			  uint32_t offset,
			  uint32_t bind_table_index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
       BRW_DATAPORT_READ_TARGET_DATA_CACHE);
   const unsigned exec_size = 1 << brw_inst_exec_size(devinfo, p->current);

   /* On newer hardware, offset is in units of owords. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   /* Build the message header: g0 with the read offset in element 2. */
   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_MOV(p,
	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
			       mrf.nr,
			       2), BRW_REGISTER_TYPE_UD),
	   brw_imm_ud(offset));
   brw_pop_insn_state(p);

   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (devinfo->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
   }

   brw_set_dp_read_message(p, insn, bind_table_index,
                           BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
			   target_cache,
			   1, /* msg_length */
                           true, /* header_present */
			   DIV_ROUND_UP(exec_size, 8)); /* response_length */

   brw_pop_insn_state(p);
}
2249 
2250 
/**
 * Emit a render-target write to the render cache.
 *
 * On Gen6+ this is a headerless SENDC with the color payload in GRFs;
 * pre-gen6 it is a SEND from MRFs with implied_header as src0.
 */
void brw_fb_WRITE(struct brw_codegen *p,
                  struct brw_reg payload,
                  struct brw_reg implied_header,
                  unsigned msg_control,
                  unsigned binding_table_index,
                  unsigned msg_length,
                  unsigned response_length,
                  bool eot,
                  bool last_render_target,
                  bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
   brw_inst *insn;
   unsigned msg_type;
   struct brw_reg dest, src0;

   if (brw_inst_exec_size(devinfo, p->current) >= BRW_EXECUTE_16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = payload;

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
      brw_inst_set_base_mrf(devinfo, insn, payload.nr);
      src0 = implied_header;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_dp_write_message(p,
			    insn,
			    binding_table_index,
			    msg_control,
			    msg_type,
                            target_cache,
			    msg_length,
			    header_present,
			    last_render_target,
			    response_length,
			    eot,
			    0 /* send_commit_msg */);
}
2310 
/**
 * Emit a Gen9+ render-target read (FB read) SENDC message.
 *
 * The message subtype encodes the execution size (0 = SIMD16,
 * 1 = SIMD8) and bit 5 of the message control selects per-sample
 * reads.  The RT slot group is derived from the instruction's
 * quarter control.  Returns the emitted instruction.
 */
brw_inst *
gen9_fb_READ(struct brw_codegen *p,
             struct brw_reg dst,
             struct brw_reg payload,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool per_sample)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(devinfo->gen >= 9);
   const unsigned msg_subtype =
      brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16 ? 0 : 1;
   brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);

   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, payload);
   brw_set_dp_read_message(p, insn, binding_table_index,
                           per_sample << 5 | msg_subtype,
                           GEN9_DATAPORT_RC_RENDER_TARGET_READ,
                           GEN6_SFID_DATAPORT_RENDER_CACHE,
                           msg_length, true /* header_present */,
                           response_length);
   /* qtr_control / 2 picks the 16-wide slot group for this dispatch. */
   brw_inst_set_rt_slot_group(devinfo, insn,
                              brw_inst_qtr_control(devinfo, p->current) / 2);

   return insn;
}
2339 
2340 /**
2341  * Texture sample instruction.
2342  * Note: the msg_type plus msg_length values determine exactly what kind
2343  * of sampling operation is performed.  See volume 4, page 161 of docs.
2344  */
void brw_SAMPLE(struct brw_codegen *p,
                struct brw_reg dest,
                unsigned msg_reg_nr,
                struct brw_reg src0,
                unsigned binding_table_index,
                unsigned sampler,
                unsigned msg_type,
                unsigned response_length,
                unsigned msg_length,
                unsigned header_present,
                unsigned simd_mode,
                unsigned return_format)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* msg_reg_nr is unsigned: callers pass -1 (which wraps to UINT_MAX)
    * to skip the pre-Gen6 implied move of src0 into the MRF.
    */
   if (msg_reg_nr != -1)
      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *     is, send). The hardware behavior is undefined if this instruction is
    *     set as compressed. However, compress control can be set to "SecHalf"
    *     to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_sampler_message(p, insn,
                           binding_table_index,
                           sampler,
                           msg_type,
                           response_length,
                           msg_length,
                           header_present,
                           simd_mode,
                           return_format);
}
2396 
2397 /* Adjust the message header's sampler state pointer to
2398  * select the correct group of 16 samplers.
2399  */
void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
                                      struct brw_reg header,
                                      struct brw_reg sampler_index)
{
   /* The "Sampler Index" field can only store values between 0 and 15.
    * However, we can add an offset to the "Sampler State Pointer"
    * field, effectively selecting a different set of 16 samplers.
    *
    * The "Sampler State Pointer" needs to be aligned to a 32-byte
    * offset, and each sampler state is only 16-bytes, so we can't
    * exclusively use the offset - we have to use both.
    */

   const struct gen_device_info *devinfo = p->devinfo;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      const int sampler_state_size = 16; /* 16 bytes */
      uint32_t sampler = sampler_index.ud;

      if (sampler >= 16) {
         assert(devinfo->is_haswell || devinfo->gen >= 8);
         /* header DWord 3 = r0.3 + 256 * (sampler / 16)
          * (16 * sampler_state_size == 256 bytes per group of 16).
          */
         brw_ADD(p,
                 get_element_ud(header, 3),
                 get_element_ud(brw_vec8_grf(0, 0), 3),
                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
      }
   } else {
      /* Non-const sampler array indexing case */
      if (devinfo->gen < 8 && !devinfo->is_haswell) {
         return;
      }

      struct brw_reg temp = get_element_ud(header, 3);

      /* temp = (sampler & 0xf0) << 4 == 256 * (sampler / 16), matching
       * the immediate case above.
       */
      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
      brw_SHL(p, temp, temp, brw_imm_ud(4));
      brw_ADD(p,
              get_element_ud(header, 3),
              get_element_ud(brw_vec8_grf(0, 0), 3),
              temp);
   }
}
2442 
2443 /* All these variables are pretty confusing - we might be better off
2444  * using bitmasks and macros for this, in the old style.  Or perhaps
2445  * just having the caller instantiate the fields in dword3 itself.
2446  */
/**
 * Emit a URB write SEND.
 *
 * Performs the pre-Gen6 implied move of \p src0 into MRF \p msg_reg_nr,
 * and on Gen7+ — unless the caller provides its own channel masks —
 * patches the message header so the channel masks are enabled.
 */
void brw_urb_WRITE(struct brw_codegen *p,
                   struct brw_reg dest,
                   unsigned msg_reg_nr,
                   struct brw_reg src0,
                   enum brw_urb_write_flags flags,
                   unsigned msg_length,
                   unsigned response_length,
                   unsigned offset,
                   unsigned swizzle)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header:
       * OR 0xff00 into DWord 5 of the header in MRF msg_reg_nr.
       */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
                       BRW_REGISTER_TYPE_UD),
                retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
                brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF(devinfo->gen));

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-Gen6, SEND carries the message base MRF in the instruction. */
   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_urb_message(p,
                       insn,
                       flags,
                       msg_length,
                       response_length,
                       offset,
                       swizzle);
}
2494 
/**
 * Emit a SEND with either an immediate or a register message descriptor.
 *
 * If \p desc is an immediate, it is placed directly in src1 of the SEND.
 * Otherwise the descriptor is loaded into address register a0 with an OR
 * and the SEND takes its descriptor from there.
 *
 * Returns the "setup" instruction (the SEND in the immediate case, the
 * OR in the indirect case) so the caller can add further descriptor bits
 * via the brw_set_*_message() helpers.
 */
struct brw_inst *
brw_send_indirect_message(struct brw_codegen *p,
                          unsigned sfid,
                          struct brw_reg dst,
                          struct brw_reg payload,
                          struct brw_reg desc)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *send;
   int setup;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   /* We hold on to the setup instruction (the SEND in the direct case, the OR
    * in the indirect case) by its index in the instruction store.  The
    * pointer returned by next_insn() may become invalid if emitting the SEND
    * in the indirect case reallocs the store.
    */

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      setup = p->nr_insn;
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src1(p, send, desc);

   } else {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the usual
       * brw_set_*_message() helper functions.
       */
      setup = p->nr_insn;
      brw_OR(p, addr, desc, brw_imm_ud(0));

      brw_pop_insn_state(p);

      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src1(p, send, addr);
   }

   /* Narrow the SEND's execution size to the destination width if needed. */
   if (dst.width < BRW_EXECUTE_8)
      brw_inst_set_exec_size(devinfo, send, dst.width);

   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
   brw_inst_set_sfid(devinfo, send, sfid);

   return &p->store[setup];
}
2552 
/**
 * Emit a surface dataport SEND whose surface index may be a register.
 *
 * A non-immediate surface index is masked to 8 bits and loaded into
 * address register a0 before being handed to
 * brw_send_indirect_message().  Returns the SEND so the caller can fill
 * in message-type/control bits.
 */
static struct brw_inst *
brw_send_indirect_surface_message(struct brw_codegen *p,
                                  unsigned sfid,
                                  struct brw_reg dst,
                                  struct brw_reg payload,
                                  struct brw_reg surface,
                                  unsigned message_len,
                                  unsigned response_len,
                                  bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   if (surface.file != BRW_IMMEDIATE_VALUE) {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
       * some surface array is accessed out of bounds.
       */
      insn = brw_AND(p, addr,
                     suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
                               BRW_GET_SWZ(surface.swizzle, 0)),
                     brw_imm_ud(0xff));

      brw_pop_insn_state(p);

      surface = addr;
   }

   insn = brw_send_indirect_message(p, sfid, dst, payload, surface);
   brw_inst_set_mlen(devinfo, insn, message_len);
   brw_inst_set_rlen(devinfo, insn, response_len);
   brw_inst_set_header_present(devinfo, insn, header_present);

   return insn;
}
2595 
2596 static bool
while_jumps_before_offset(const struct gen_device_info * devinfo,brw_inst * insn,int while_offset,int start_offset)2597 while_jumps_before_offset(const struct gen_device_info *devinfo,
2598                           brw_inst *insn, int while_offset, int start_offset)
2599 {
2600    int scale = 16 / brw_jump_scale(devinfo);
2601    int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2602                                : brw_inst_jip(devinfo, insn);
2603    assert(jip < 0);
2604    return while_offset + jip * scale <= start_offset;
2605 }
2606 
2607 
2608 static int
brw_find_next_block_end(struct brw_codegen * p,int start_offset)2609 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2610 {
2611    int offset;
2612    void *store = p->store;
2613    const struct gen_device_info *devinfo = p->devinfo;
2614 
2615    int depth = 0;
2616 
2617    for (offset = next_offset(devinfo, store, start_offset);
2618         offset < p->next_insn_offset;
2619         offset = next_offset(devinfo, store, offset)) {
2620       brw_inst *insn = store + offset;
2621 
2622       switch (brw_inst_opcode(devinfo, insn)) {
2623       case BRW_OPCODE_IF:
2624          depth++;
2625          break;
2626       case BRW_OPCODE_ENDIF:
2627          if (depth == 0)
2628             return offset;
2629          depth--;
2630          break;
2631       case BRW_OPCODE_WHILE:
2632          /* If the while doesn't jump before our instruction, it's the end
2633           * of a sibling do...while loop.  Ignore it.
2634           */
2635          if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2636             continue;
2637          /* fallthrough */
2638       case BRW_OPCODE_ELSE:
2639       case BRW_OPCODE_HALT:
2640          if (depth == 0)
2641             return offset;
2642       }
2643    }
2644 
2645    return 0;
2646 }
2647 
2648 /* There is no DO instruction on gen6, so to find the end of the loop
2649  * we have to see if the loop is jumping back before our start
2650  * instruction.
2651  */
2652 static int
brw_find_loop_end(struct brw_codegen * p,int start_offset)2653 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2654 {
2655    const struct gen_device_info *devinfo = p->devinfo;
2656    int offset;
2657    void *store = p->store;
2658 
2659    assert(devinfo->gen >= 6);
2660 
2661    /* Always start after the instruction (such as a WHILE) we're trying to fix
2662     * up.
2663     */
2664    for (offset = next_offset(devinfo, store, start_offset);
2665         offset < p->next_insn_offset;
2666         offset = next_offset(devinfo, store, offset)) {
2667       brw_inst *insn = store + offset;
2668 
2669       if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2670 	 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2671 	    return offset;
2672       }
2673    }
2674    assert(!"not reached");
2675    return start_offset;
2676 }
2677 
2678 /* After program generation, go back and update the UIP and JIP of
2679  * BREAK, CONT, and HALT instructions to their correct locations.
2680  */
void
brw_set_uip_jip(struct brw_codegen *p, int start_offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   int offset;
   /* br: jump-field units per 16-byte instruction; scale: bytes per unit. */
   int br = brw_jump_scale(devinfo);
   int scale = 16 / br;
   void *store = p->store;

   /* Nothing to patch before Gen6. */
   if (devinfo->gen < 6)
      return;

   for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
      brw_inst *insn = store + offset;
      assert(brw_inst_cmpt_control(devinfo, insn) == 0);

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         /* Gen7 UIP points to WHILE; Gen6 points just after it */
         brw_inst_set_uip(devinfo, insn,
            (brw_find_loop_end(p, offset) - offset +
             (devinfo->gen == 6 ? 16 : 0)) / scale);
         break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         brw_inst_set_uip(devinfo, insn,
            (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;

      case BRW_OPCODE_ENDIF: {
         /* An ENDIF with no block end jumps to the next instruction. */
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->gen >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gen6_jump_count(devinfo, insn, jump);
         break;
      }

      case BRW_OPCODE_HALT:
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same. In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
         } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;
      }
   }
}
2750 
/**
 * Emit a SEND carrying an FF_SYNC message.
 *
 * Resolves the pre-Gen6 implied move of \p src0 into MRF
 * \p msg_reg_nr, then fills in the FF_SYNC descriptor via
 * brw_set_ff_sync_message().
 */
void brw_ff_sync(struct brw_codegen *p,
                 struct brw_reg dest,
                 unsigned msg_reg_nr,
                 struct brw_reg src0,
                 bool allocate,
                 unsigned response_length,
                 bool eot)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-Gen6, SEND carries the message base MRF in the instruction. */
   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_ff_sync_message(p,
                           insn,
                           allocate,
                           response_length,
                           eot);
}
2778 
2779 /**
2780  * Emit the SEND instruction necessary to generate stream output data on Gen6
2781  * (for transform feedback).
2782  *
2783  * If send_commit_msg is true, this is the last piece of stream output data
2784  * from this thread, so send the data as a committed write.  According to the
2785  * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2786  *
2787  *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2788  *   writes are complete by sending the final write as a committed write."
2789  */
void
brw_svb_write(struct brw_codegen *p,
              struct brw_reg dest,
              unsigned msg_reg_nr,
              struct brw_reg src0,
              unsigned binding_table_index,
              bool   send_commit_msg)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* The streamed-VB write goes to a different dataport per generation. */
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));
   /* A committed write also returns data, so response_length mirrors
    * send_commit_msg (0 or 1).
    */
   brw_set_dp_write_message(p, insn,
                            binding_table_index,
                            0, /* msg_control: ignored */
                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
                            target_cache,
                            1, /* msg_length */
                            true, /* header_present */
                            0, /* last_render_target: ignored */
                            send_commit_msg, /* response_length */
                            0, /* end_of_thread */
                            send_commit_msg); /* send_commit_msg */
}
2823 
2824 static unsigned
brw_surface_payload_size(struct brw_codegen * p,unsigned num_channels,bool has_simd4x2,bool has_simd16)2825 brw_surface_payload_size(struct brw_codegen *p,
2826                          unsigned num_channels,
2827                          bool has_simd4x2,
2828                          bool has_simd16)
2829 {
2830    if (has_simd4x2 &&
2831        brw_inst_access_mode(p->devinfo, p->current) == BRW_ALIGN_16)
2832       return 1;
2833    else if (has_simd16 &&
2834             brw_inst_exec_size(p->devinfo, p->current) == BRW_EXECUTE_16)
2835       return 2 * num_channels;
2836    else
2837       return num_channels;
2838 }
2839 
/* Fill in the message type and control bits of an untyped atomic
 * dataport message.  msg_control carries the atomic opcode, the
 * return-data-expected flag (bit 5) and, in Align1, the SIMD mode
 * (bit 4).
 */
static void
brw_set_dp_untyped_atomic_message(struct brw_codegen *p,
                                  brw_inst *insn,
                                  unsigned atomic_op,
                                  bool response_expected)
{
   const struct gen_device_info *devinfo = p->devinfo;
   unsigned msg_control =
      atomic_op | /* Atomic Operation Type: BRW_AOP_* */
      (response_expected ? 1 << 5 : 0); /* Return data expected */

   if (devinfo->gen >= 8 || devinfo->is_haswell) {
      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
         if (brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16)
            msg_control |= 1 << 4; /* SIMD8 mode */

         brw_inst_set_dp_msg_type(devinfo, insn,
                                  HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP);
      } else {
         brw_inst_set_dp_msg_type(devinfo, insn,
            HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2);
      }
   } else {
      /* Ivybridge: no SIMD4x2 variant. */
      brw_inst_set_dp_msg_type(devinfo, insn,
                               GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP);

      if (brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16)
         msg_control |= 1 << 4; /* SIMD8 mode */
   }

   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
}
2872 
/**
 * Emit an untyped atomic dataport message.
 *
 * Note that \p response_expected doubles as the channel count (0 or 1)
 * passed to brw_surface_payload_size(): an atomic returns at most one
 * channel of data.
 */
void
brw_untyped_atomic(struct brw_codegen *p,
                   struct brw_reg dst,
                   struct brw_reg payload,
                   struct brw_reg surface,
                   unsigned atomic_op,
                   unsigned msg_length,
                   bool response_expected)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   /* Mask out unused components -- This is especially important in Align16
    * mode on generations that don't have native support for SIMD4x2 atomics,
    * because unused but enabled components will cause the dataport to perform
    * additional atomic operations on the addresses that happen to be in the
    * uninitialized Y, Z and W coordinates of the payload.
    */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
      brw_surface_payload_size(p, response_expected,
                               devinfo->gen >= 8 || devinfo->is_haswell, true),
      align1);

   brw_set_dp_untyped_atomic_message(
      p, insn, atomic_op, response_expected);
}
2903 
2904 static void
brw_set_dp_untyped_surface_read_message(struct brw_codegen * p,struct brw_inst * insn,unsigned num_channels)2905 brw_set_dp_untyped_surface_read_message(struct brw_codegen *p,
2906                                         struct brw_inst *insn,
2907                                         unsigned num_channels)
2908 {
2909    const struct gen_device_info *devinfo = p->devinfo;
2910    /* Set mask of 32-bit channels to drop. */
2911    unsigned msg_control = 0xf & (0xf << num_channels);
2912 
2913    if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2914       if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
2915          msg_control |= 1 << 4; /* SIMD16 mode */
2916       else
2917          msg_control |= 2 << 4; /* SIMD8 mode */
2918    }
2919 
2920    brw_inst_set_dp_msg_type(devinfo, insn,
2921                             (devinfo->gen >= 8 || devinfo->is_haswell ?
2922                              HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
2923                              GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ));
2924    brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2925 }
2926 
/**
 * Emit an untyped surface read returning \p num_channels 32-bit
 * channels per enabled lane.
 */
void
brw_untyped_surface_read(struct brw_codegen *p,
                         struct brw_reg dst,
                         struct brw_reg payload,
                         struct brw_reg surface,
                         unsigned msg_length,
                         unsigned num_channels)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, dst, payload, surface, msg_length,
      brw_surface_payload_size(p, num_channels, true, true),
      false);

   brw_set_dp_untyped_surface_read_message(
      p, insn, num_channels);
}
2947 
2948 static void
brw_set_dp_untyped_surface_write_message(struct brw_codegen * p,struct brw_inst * insn,unsigned num_channels)2949 brw_set_dp_untyped_surface_write_message(struct brw_codegen *p,
2950                                          struct brw_inst *insn,
2951                                          unsigned num_channels)
2952 {
2953    const struct gen_device_info *devinfo = p->devinfo;
2954    /* Set mask of 32-bit channels to drop. */
2955    unsigned msg_control = 0xf & (0xf << num_channels);
2956 
2957    if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2958       if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
2959          msg_control |= 1 << 4; /* SIMD16 mode */
2960       else
2961          msg_control |= 2 << 4; /* SIMD8 mode */
2962    } else {
2963       if (devinfo->gen >= 8 || devinfo->is_haswell)
2964          msg_control |= 0 << 4; /* SIMD4x2 mode */
2965       else
2966          msg_control |= 2 << 4; /* SIMD8 mode */
2967    }
2968 
2969    brw_inst_set_dp_msg_type(devinfo, insn,
2970                             devinfo->gen >= 8 || devinfo->is_haswell ?
2971                              HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE :
2972                              GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE);
2973    brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2974 }
2975 
/**
 * Emit an untyped surface write of \p num_channels 32-bit channels per
 * enabled lane.  The destination is null; no response is expected.
 */
void
brw_untyped_surface_write(struct brw_codegen *p,
                          struct brw_reg payload,
                          struct brw_reg surface,
                          unsigned msg_length,
                          unsigned num_channels)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
   const unsigned mask = devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
                          WRITEMASK_X : WRITEMASK_XYZW;
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, brw_writemask(brw_null_reg(), mask),
      payload, surface, msg_length, 0, align1);

   brw_set_dp_untyped_surface_write_message(
      p, insn, num_channels);
}
2998 
2999 static unsigned
brw_byte_scattered_data_element_from_bit_size(unsigned bit_size)3000 brw_byte_scattered_data_element_from_bit_size(unsigned bit_size)
3001 {
3002    switch (bit_size) {
3003    case 8:
3004       return GEN7_BYTE_SCATTERED_DATA_ELEMENT_BYTE;
3005    case 16:
3006       return GEN7_BYTE_SCATTERED_DATA_ELEMENT_WORD;
3007    case 32:
3008       return GEN7_BYTE_SCATTERED_DATA_ELEMENT_DWORD;
3009    default:
3010       unreachable("Unsupported bit_size for byte scattered messages");
3011    }
3012 }
3013 
3014 
3015 void
brw_byte_scattered_read(struct brw_codegen * p,struct brw_reg dst,struct brw_reg payload,struct brw_reg surface,unsigned msg_length,unsigned bit_size)3016 brw_byte_scattered_read(struct brw_codegen *p,
3017                         struct brw_reg dst,
3018                         struct brw_reg payload,
3019                         struct brw_reg surface,
3020                         unsigned msg_length,
3021                         unsigned bit_size)
3022 {
3023    const struct gen_device_info *devinfo = p->devinfo;
3024    assert(devinfo->gen > 7 || devinfo->is_haswell);
3025    assert(brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
3026    const unsigned sfid =  GEN7_SFID_DATAPORT_DATA_CACHE;
3027 
3028    struct brw_inst *insn = brw_send_indirect_surface_message(
3029       p, sfid, dst, payload, surface, msg_length,
3030       brw_surface_payload_size(p, 1, true, true),
3031       false);
3032 
3033    unsigned msg_control =
3034       brw_byte_scattered_data_element_from_bit_size(bit_size) << 2;
3035 
3036    if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
3037       msg_control |= 1; /* SIMD16 mode */
3038    else
3039       msg_control |= 0; /* SIMD8 mode */
3040 
3041    brw_inst_set_dp_msg_type(devinfo, insn,
3042                             HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ);
3043    brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3044 }
3045 
3046 void
brw_byte_scattered_write(struct brw_codegen * p,struct brw_reg payload,struct brw_reg surface,unsigned msg_length,unsigned bit_size)3047 brw_byte_scattered_write(struct brw_codegen *p,
3048                          struct brw_reg payload,
3049                          struct brw_reg surface,
3050                          unsigned msg_length,
3051                          unsigned bit_size)
3052 {
3053    const struct gen_device_info *devinfo = p->devinfo;
3054    assert(devinfo->gen > 7 || devinfo->is_haswell);
3055    assert(brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
3056    const unsigned sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
3057 
3058    struct brw_inst *insn = brw_send_indirect_surface_message(
3059       p, sfid, brw_writemask(brw_null_reg(), WRITEMASK_XYZW),
3060       payload, surface, msg_length, 0, true);
3061 
3062    unsigned msg_control =
3063       brw_byte_scattered_data_element_from_bit_size(bit_size) << 2;
3064 
3065    if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
3066       msg_control |= 1;
3067    else
3068       msg_control |= 0;
3069 
3070    brw_inst_set_dp_msg_type(devinfo, insn,
3071                             HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE);
3072    brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3073 }
3074 
/* Fill in the message descriptor fields of an already-emitted SEND for a
 * typed atomic operation: packs the atomic opcode, the "return data
 * expected" flag and (in Align1 mode) the sample-mask half selector into
 * msg_control, and picks the per-gen message type.
 */
static void
brw_set_dp_typed_atomic_message(struct brw_codegen *p,
                                struct brw_inst *insn,
                                unsigned atomic_op,
                                bool response_expected)
{
   const struct gen_device_info *devinfo = p->devinfo;
   unsigned msg_control =
      atomic_op | /* Atomic Operation Type: BRW_AOP_* */
      (response_expected ? 1 << 5 : 0); /* Return data expected */

   if (devinfo->gen >= 8 || devinfo->is_haswell) {
      /* HSW+: typed atomics are data cache port 1 messages. */
      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
         if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
            msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */

         brw_inst_set_dp_msg_type(devinfo, insn,
                                  HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP);
      } else {
         brw_inst_set_dp_msg_type(devinfo, insn,
                                  HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2);
      }

   } else {
      /* IVB: typed atomics go through the render cache, and the slot
       * selector sits in a different msg_control bit.
       */
      brw_inst_set_dp_msg_type(devinfo, insn,
                               GEN7_DATAPORT_RC_TYPED_ATOMIC_OP);

      if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
         msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
   }

   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
}
3108 
3109 void
brw_typed_atomic(struct brw_codegen * p,struct brw_reg dst,struct brw_reg payload,struct brw_reg surface,unsigned atomic_op,unsigned msg_length,bool response_expected)3110 brw_typed_atomic(struct brw_codegen *p,
3111                  struct brw_reg dst,
3112                  struct brw_reg payload,
3113                  struct brw_reg surface,
3114                  unsigned atomic_op,
3115                  unsigned msg_length,
3116                  bool response_expected) {
3117    const struct gen_device_info *devinfo = p->devinfo;
3118    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3119                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
3120                           GEN6_SFID_DATAPORT_RENDER_CACHE);
3121    const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
3122    /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3123    const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
3124    struct brw_inst *insn = brw_send_indirect_surface_message(
3125       p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
3126       brw_surface_payload_size(p, response_expected,
3127                                devinfo->gen >= 8 || devinfo->is_haswell, false),
3128       true);
3129 
3130    brw_set_dp_typed_atomic_message(
3131       p, insn, atomic_op, response_expected);
3132 }
3133 
/* Fill in the message descriptor fields of an already-emitted SEND for a
 * typed surface read.  The low four bits of msg_control mask off the
 * trailing components that were NOT requested; the upper bits select which
 * half of the sample mask applies in Align1 mode.
 */
static void
brw_set_dp_typed_surface_read_message(struct brw_codegen *p,
                                      struct brw_inst *insn,
                                      unsigned num_channels)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Set mask of unused channels. */
   unsigned msg_control = 0xf & (0xf << num_channels);

   if (devinfo->gen >= 8 || devinfo->is_haswell) {
      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
         if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
            msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
         else
            msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
      }

      brw_inst_set_dp_msg_type(devinfo, insn,
                               HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ);
   } else {
      /* IVB: typed reads go through the render cache and use a different
       * slot-selector bit.
       */
      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
         if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
            msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
      }

      brw_inst_set_dp_msg_type(devinfo, insn,
                               GEN7_DATAPORT_RC_TYPED_SURFACE_READ);
   }

   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
}
3165 
3166 void
brw_typed_surface_read(struct brw_codegen * p,struct brw_reg dst,struct brw_reg payload,struct brw_reg surface,unsigned msg_length,unsigned num_channels)3167 brw_typed_surface_read(struct brw_codegen *p,
3168                        struct brw_reg dst,
3169                        struct brw_reg payload,
3170                        struct brw_reg surface,
3171                        unsigned msg_length,
3172                        unsigned num_channels)
3173 {
3174    const struct gen_device_info *devinfo = p->devinfo;
3175    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3176                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
3177                           GEN6_SFID_DATAPORT_RENDER_CACHE);
3178    struct brw_inst *insn = brw_send_indirect_surface_message(
3179       p, sfid, dst, payload, surface, msg_length,
3180       brw_surface_payload_size(p, num_channels,
3181                                devinfo->gen >= 8 || devinfo->is_haswell, false),
3182       true);
3183 
3184    brw_set_dp_typed_surface_read_message(
3185       p, insn, num_channels);
3186 }
3187 
/* Fill in the message descriptor fields of an already-emitted SEND for a
 * typed surface write.  The low four bits of msg_control mask off the
 * trailing components that are NOT being written; the upper bits select
 * which half of the sample mask applies in Align1 mode.
 */
static void
brw_set_dp_typed_surface_write_message(struct brw_codegen *p,
                                       struct brw_inst *insn,
                                       unsigned num_channels)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Set mask of unused channels. */
   unsigned msg_control = 0xf & (0xf << num_channels);

   if (devinfo->gen >= 8 || devinfo->is_haswell) {
      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
         if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
            msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
         else
            msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
      }

      brw_inst_set_dp_msg_type(devinfo, insn,
                               HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE);

   } else {
      /* IVB: typed writes go through the render cache and use a different
       * slot-selector bit.
       */
      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
         if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
            msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
      }

      brw_inst_set_dp_msg_type(devinfo, insn,
                               GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE);
   }

   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
}
3220 
3221 void
brw_typed_surface_write(struct brw_codegen * p,struct brw_reg payload,struct brw_reg surface,unsigned msg_length,unsigned num_channels)3222 brw_typed_surface_write(struct brw_codegen *p,
3223                         struct brw_reg payload,
3224                         struct brw_reg surface,
3225                         unsigned msg_length,
3226                         unsigned num_channels)
3227 {
3228    const struct gen_device_info *devinfo = p->devinfo;
3229    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3230                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
3231                           GEN6_SFID_DATAPORT_RENDER_CACHE);
3232    const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
3233    /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3234    const unsigned mask = (devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3235                           WRITEMASK_X : WRITEMASK_XYZW);
3236    struct brw_inst *insn = brw_send_indirect_surface_message(
3237       p, sfid, brw_writemask(brw_null_reg(), mask),
3238       payload, surface, msg_length, 0, true);
3239 
3240    brw_set_dp_typed_surface_write_message(
3241       p, insn, num_channels);
3242 }
3243 
3244 static void
brw_set_memory_fence_message(struct brw_codegen * p,struct brw_inst * insn,enum brw_message_target sfid,bool commit_enable)3245 brw_set_memory_fence_message(struct brw_codegen *p,
3246                              struct brw_inst *insn,
3247                              enum brw_message_target sfid,
3248                              bool commit_enable)
3249 {
3250    const struct gen_device_info *devinfo = p->devinfo;
3251 
3252    brw_set_message_descriptor(p, insn, sfid,
3253                               1 /* message length */,
3254                               (commit_enable ? 1 : 0) /* response length */,
3255                               true /* header present */,
3256                               false);
3257 
3258    switch (sfid) {
3259    case GEN6_SFID_DATAPORT_RENDER_CACHE:
3260       brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
3261       break;
3262    case GEN7_SFID_DATAPORT_DATA_CACHE:
3263       brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
3264       break;
3265    default:
3266       unreachable("Not reached");
3267    }
3268 
3269    if (commit_enable)
3270       brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3271 }
3272 
/**
 * Emit a memory fence on the data cache, plus a second fence on the render
 * cache on IVB (which does typed surface access through it -- see below).
 * dst is only used for dependency tracking; nothing meaningful is written
 * to it unless the commit-enable workarounds below apply.
 */
void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* Commit-enable is required on gen10+ and IVB as hardware workarounds. */
   const bool commit_enable =
      devinfo->gen >= 10 || /* HSD ES # 1404612949 */
      (devinfo->gen == 7 && !devinfo->is_haswell);
   struct brw_inst *insn;

   /* Emit everything as a scalar, unmasked operation. */
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   dst = vec1(dst);

   /* Set dst as destination for dependency tracking, the MEMORY_FENCE
    * message doesn't write anything back.
    */
   insn = next_insn(p, BRW_OPCODE_SEND);
   dst = retype(dst, BRW_REGISTER_TYPE_UW);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, dst);
   brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                commit_enable);

   if (devinfo->gen == 7 && !devinfo->is_haswell) {
      /* IVB does typed surface access through the render cache, so we need to
       * flush it too.  Use a different register so both flushes can be
       * pipelined by the hardware.
       */
      insn = next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, insn, offset(dst, 1));
      brw_set_src0(p, insn, offset(dst, 1));
      brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
                                   commit_enable);

      /* Now write the response of the second message into the response of the
       * first to trigger a pipeline stall -- This way future render and data
       * cache messages will be properly ordered with respect to past data and
       * render cache messages.
       */
      brw_MOV(p, dst, offset(dst, 1));
   }

   brw_pop_insn_state(p);
}
3319 
3320 void
brw_pixel_interpolator_query(struct brw_codegen * p,struct brw_reg dest,struct brw_reg mrf,bool noperspective,unsigned mode,struct brw_reg data,unsigned msg_length,unsigned response_length)3321 brw_pixel_interpolator_query(struct brw_codegen *p,
3322                              struct brw_reg dest,
3323                              struct brw_reg mrf,
3324                              bool noperspective,
3325                              unsigned mode,
3326                              struct brw_reg data,
3327                              unsigned msg_length,
3328                              unsigned response_length)
3329 {
3330    const struct gen_device_info *devinfo = p->devinfo;
3331    struct brw_inst *insn;
3332    const uint16_t exec_size = brw_inst_exec_size(devinfo, p->current);
3333 
3334    /* brw_send_indirect_message will automatically use a direct send message
3335     * if data is actually immediate.
3336     */
3337    insn = brw_send_indirect_message(p,
3338                                     GEN7_SFID_PIXEL_INTERPOLATOR,
3339                                     dest,
3340                                     mrf,
3341                                     vec1(data));
3342    brw_inst_set_mlen(devinfo, insn, msg_length);
3343    brw_inst_set_rlen(devinfo, insn, response_length);
3344 
3345    brw_inst_set_pi_simd_mode(devinfo, insn, exec_size == BRW_EXECUTE_16);
3346    brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */
3347    brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
3348    brw_inst_set_pi_message_type(devinfo, insn, mode);
3349 }
3350 
/**
 * Write the index of the first enabled execution channel into (the first
 * component of) dst.  Where ce0 is used, 'mask' is combined with it to
 * discard channels that were never dispatched (see the comments below);
 * callers pass an all-ones immediate when the dispatch mask is known to be
 * tightly packed.
 */
void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
                      struct brw_reg mask)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned exec_size = 1 << brw_inst_exec_size(devinfo, p->current);
   const unsigned qtr_control = brw_inst_qtr_control(devinfo, p->current);
   brw_inst *inst;

   assert(devinfo->gen >= 7);
   assert(mask.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);

   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* Getting the first active channel index is easy on Gen8: Just find
          * the first bit set in the execution mask.  The register exists on
          * HSW already but it reads back as all ones when the current
          * instruction has execution masking disabled, so it's kind of
          * useless.
          */
         struct brw_reg exec_mask =
            retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
            /* Unfortunately, ce0 does not take into account the thread
             * dispatch mask, which may be a problem in cases where it's not
             * tightly packed (i.e. it doesn't have the form '2^n - 1' for
             * some n).  Combine ce0 with the given dispatch (or vector) mask
             * to mask off those channels which were never dispatched by the
             * hardware.
             */
            brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
            brw_AND(p, vec1(dst), exec_mask, vec1(dst));
            exec_mask = vec1(dst);
         }

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first active channel relative to the
          * specified quarter control as result.
          */
         inst = brw_FBL(p, vec1(dst), exec_mask);
      } else {
         const struct brw_reg flag = brw_flag_reg(1, 0);

         /* Start with an all-zero flag register. */
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));

         /* Run enough instructions returning zero with execution masking and
          * a conditional modifier enabled in order to get the full execution
          * mask in f1.0.  We could use a single 32-wide move here if it
          * weren't because of the hardware bug that causes channel enables to
          * be applied incorrectly to the second half of 32-wide instructions
          * on Gen7.
          */
         const unsigned lower_size = MIN2(16, exec_size);
         for (unsigned i = 0; i < exec_size / lower_size; i++) {
            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                           brw_imm_uw(0));
            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
            brw_inst_set_flag_reg_nr(devinfo, inst, 1);
            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
         }

         /* Find the first bit set in the exec_size-wide portion of the flag
          * register that was updated by the last sequence of MOV
          * instructions.
          */
         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8 &&
          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register.  Note that ce0
          * doesn't take into account the dispatch mask, so the Gen7 path
          * should be used instead unless you have the guarantee that the
          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
          * for some n).
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination without and with execution masking to
          * find out which of the channels is active.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}
3463 
/**
 * Copy the value of channel 'idx' of 'src' into every channel of 'dst'.
 * In Align1 mode this uses indirect addressing off a0.0; in Align16
 * (SIMD4x2) mode it uses a flag-predicated SEL, so 'idx' can only be zero
 * or one there (see the comments below).
 */
void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);
   assert(!src.abs && !src.negate);
   assert(src.type == dst.type);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job, but
       * asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
   } else {
      /* From the Haswell PRM section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address.  The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Fortunately, for broadcast, we never have a sub-register offset so
       * this isn't an issue.
       */
      assert(src.subnr == 0);

      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit) {
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
            offset = offset % limit;
         }

         brw_pop_insn_state(p);

         /* Use indirect addressing to fetch the specified component. */
         if (type_sz(src.type) > 4 &&
             (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
            /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *    integer DWord multiply, indirect addressing must not be
             *    used."
             *
             * To work around both of this issue, we do two integer MOVs
             * insead of one 64-bit MOV.  Because no double value should ever
             * cross a register boundary, it's safe to use the immediate
             * offset in the indirect here to handle adding 4 bytes to the
             * offset and avoid the extra ADD to the register file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                       retype(brw_vec1_indirect(addr.subnr, offset),
                              BRW_REGISTER_TYPE_D));
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                       retype(brw_vec1_indirect(addr.subnr, offset + 4),
                              BRW_REGISTER_TYPE_D));
         } else {
            brw_MOV(p, dst,
                    retype(brw_vec1_indirect(addr.subnr, offset), src.type));
         }
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}
3583 
3584 /**
3585  * This instruction is generated as a single-channel align1 instruction by
3586  * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3587  *
3588  * We can't use the typed atomic op in the FS because that has the execution
3589  * mask ANDed with the pixel mask, but we just want to write the one dword for
3590  * all the pixels.
3591  *
3592  * We don't use the SIMD4x2 atomic ops in the VS because want to just write
3593  * one u32.  So we use the same untyped atomic write message as the pixel
3594  * shader.
3595  *
3596  * The untyped atomic operation requires a BUFFER surface type with RAW
3597  * format, and is only accessible through the legacy DATA_CACHE dataport
3598  * messages.
3599  */
void brw_shader_time_add(struct brw_codegen *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   const unsigned sfid = (p->devinfo->gen >= 8 || p->devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   assert(p->devinfo->gen >= 7);

   /* Force a single-channel Align1 emission regardless of the caller's
    * default state.
    */
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                      payload.nr, 0));
   brw_set_src1(p, send, brw_imm_ud(0));
   /* msg length 2, response length 0, no header. */
   brw_set_message_descriptor(p, send, sfid, 2, 0, false, false);
   brw_inst_set_binding_table_index(p->devinfo, send, surf_index);
   /* Accumulate into the buffer with an untyped atomic ADD, discarding the
    * return value.
    */
   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false);

   brw_pop_insn_state(p);
}
3629 
3630 
/**
 * Emit the SEND message for a barrier
 *
 * Sends a message-gateway "barrier" message with the notify bit set; src
 * supplies the single payload register and nothing is written back.
 */
void
brw_barrier(struct brw_codegen *p, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *inst;

   assert(devinfo->gen >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   inst = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, inst, src);
   brw_set_src1(p, inst, brw_null_reg());

   brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY,
                              1 /* msg_length */,
                              0 /* response_length */,
                              false /* header_present */,
                              false /* end_of_thread */);

   brw_inst_set_gateway_notify(devinfo, inst, 1);
   brw_inst_set_gateway_subfuncid(devinfo, inst,
                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);

   /* Run unmasked so the barrier fires exactly once per thread. */
   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_pop_insn_state(p);
}
3662 
3663 
3664 /**
3665  * Emit the wait instruction for a barrier
3666  */
3667 void
brw_WAIT(struct brw_codegen * p)3668 brw_WAIT(struct brw_codegen *p)
3669 {
3670    const struct gen_device_info *devinfo = p->devinfo;
3671    struct brw_inst *insn;
3672 
3673    struct brw_reg src = brw_notification_reg();
3674 
3675    insn = next_insn(p, BRW_OPCODE_WAIT);
3676    brw_set_dest(p, insn, src);
3677    brw_set_src0(p, insn, src);
3678    brw_set_src1(p, insn, brw_null_reg());
3679 
3680    brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3681    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3682 }
3683 
3684 /**
3685  * Changes the floating point rounding mode updating the control register
3686  * field defined at cr0.0[5-6] bits. This function supports the changes to
3687  * RTNE (00), RU (01), RD (10) and RTZ (11) rounding using bitwise operations.
3688  * Only RTNE and RTZ rounding are enabled at nir.
3689  */
3690 void
brw_rounding_mode(struct brw_codegen * p,enum brw_rnd_mode mode)3691 brw_rounding_mode(struct brw_codegen *p,
3692                   enum brw_rnd_mode mode)
3693 {
3694    const unsigned bits = mode << BRW_CR0_RND_MODE_SHIFT;
3695 
3696    if (bits != BRW_CR0_RND_MODE_MASK) {
3697       brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
3698                                brw_imm_ud(~BRW_CR0_RND_MODE_MASK));
3699 
3700       /* From the Skylake PRM, Volume 7, page 760:
3701        *  "Implementation Restriction on Register Access: When the control
3702        *   register is used as an explicit source and/or destination, hardware
3703        *   does not ensure execution pipeline coherency. Software must set the
3704        *   thread control field to ‘switch’ for an instruction that uses
3705        *   control register as an explicit operand."
3706        */
3707       brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
3708     }
3709 
3710    if (bits) {
3711       brw_inst *inst = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
3712                               brw_imm_ud(bits));
3713       brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
3714    }
3715 }
3716