1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32 #include <string.h>
33
34 #include "brw_context.h"
35 #include "brw_defines.h"
36 #include "brw_eu.h"
37
38 #include "ralloc.h"
39
40 /***********************************************************************
41 * Internal helper for constructing instructions
42 */
43
guess_execution_size(struct brw_compile * p,struct brw_instruction * insn,struct brw_reg reg)44 static void guess_execution_size(struct brw_compile *p,
45 struct brw_instruction *insn,
46 struct brw_reg reg)
47 {
48 if (reg.width == BRW_WIDTH_8 && p->compressed)
49 insn->header.execution_size = BRW_EXECUTE_16;
50 else
51 insn->header.execution_size = reg.width; /* note - definitions are compatible */
52 }
53
54
55 /**
56 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
57 * registers, implicitly moving the operand to a message register.
58 *
59 * On Sandybridge, this is no longer the case. This function performs the
60 * explicit move; it should be called before emitting a SEND instruction.
61 */
62 void
gen6_resolve_implied_move(struct brw_compile * p,struct brw_reg * src,unsigned msg_reg_nr)63 gen6_resolve_implied_move(struct brw_compile *p,
64 struct brw_reg *src,
65 unsigned msg_reg_nr)
66 {
67 struct intel_context *intel = &p->brw->intel;
68 if (intel->gen < 6)
69 return;
70
71 if (src->file == BRW_MESSAGE_REGISTER_FILE)
72 return;
73
74 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
75 brw_push_insn_state(p);
76 brw_set_mask_control(p, BRW_MASK_DISABLE);
77 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
78 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
79 retype(*src, BRW_REGISTER_TYPE_UD));
80 brw_pop_insn_state(p);
81 }
82 *src = brw_message_reg(msg_reg_nr);
83 }
84
85 static void
gen7_convert_mrf_to_grf(struct brw_compile * p,struct brw_reg * reg)86 gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
87 {
88 /* From the BSpec / ISA Reference / send - [DevIVB+]:
89 * "The send with EOT should use register space R112-R127 for <src>. This is
90 * to enable loading of a new thread into the same slot while the message
91 * with EOT for current thread is pending dispatch."
92 *
93 * Since we're pretending to have 16 MRFs anyway, we may as well use the
94 * registers required for messages with EOT.
95 */
96 struct intel_context *intel = &p->brw->intel;
97 if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
98 reg->file = BRW_GENERAL_REGISTER_FILE;
99 reg->nr += GEN7_MRF_HACK_START;
100 }
101 }
102
103
104 void
brw_set_dest(struct brw_compile * p,struct brw_instruction * insn,struct brw_reg dest)105 brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
106 struct brw_reg dest)
107 {
108 if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
109 dest.file != BRW_MESSAGE_REGISTER_FILE)
110 assert(dest.nr < 128);
111
112 gen7_convert_mrf_to_grf(p, &dest);
113
114 insn->bits1.da1.dest_reg_file = dest.file;
115 insn->bits1.da1.dest_reg_type = dest.type;
116 insn->bits1.da1.dest_address_mode = dest.address_mode;
117
118 if (dest.address_mode == BRW_ADDRESS_DIRECT) {
119 insn->bits1.da1.dest_reg_nr = dest.nr;
120
121 if (insn->header.access_mode == BRW_ALIGN_1) {
122 insn->bits1.da1.dest_subreg_nr = dest.subnr;
123 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
124 dest.hstride = BRW_HORIZONTAL_STRIDE_1;
125 insn->bits1.da1.dest_horiz_stride = dest.hstride;
126 }
127 else {
128 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
129 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
130 /* even ignored in da16, still need to set as '01' */
131 insn->bits1.da16.dest_horiz_stride = 1;
132 }
133 }
134 else {
135 insn->bits1.ia1.dest_subreg_nr = dest.subnr;
136
137 /* These are different sizes in align1 vs align16:
138 */
139 if (insn->header.access_mode == BRW_ALIGN_1) {
140 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
141 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
142 dest.hstride = BRW_HORIZONTAL_STRIDE_1;
143 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
144 }
145 else {
146 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
147 /* even ignored in da16, still need to set as '01' */
148 insn->bits1.ia16.dest_horiz_stride = 1;
149 }
150 }
151
152 /* NEW: Set the execution size based on dest.width and
153 * insn->compression_control:
154 */
155 guess_execution_size(p, insn, dest);
156 }
157
158 extern int reg_type_size[];
159
160 static void
validate_reg(struct brw_instruction * insn,struct brw_reg reg)161 validate_reg(struct brw_instruction *insn, struct brw_reg reg)
162 {
163 int hstride_for_reg[] = {0, 1, 2, 4};
164 int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
165 int width_for_reg[] = {1, 2, 4, 8, 16};
166 int execsize_for_reg[] = {1, 2, 4, 8, 16, 32};
167 int width, hstride, vstride, execsize;
168
169 if (reg.file == BRW_IMMEDIATE_VALUE) {
170 /* 3.3.6: Region Parameters. Restriction: Immediate vectors
171 * mean the destination has to be 128-bit aligned and the
172 * destination horiz stride has to be a word.
173 */
174 if (reg.type == BRW_REGISTER_TYPE_V) {
175 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
176 reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
177 }
178
179 return;
180 }
181
182 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
183 reg.nr == BRW_ARF_NULL)
184 return;
185
186 assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
187 hstride = hstride_for_reg[reg.hstride];
188
189 if (reg.vstride == 0xf) {
190 vstride = -1;
191 } else {
192 assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
193 vstride = vstride_for_reg[reg.vstride];
194 }
195
196 assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
197 width = width_for_reg[reg.width];
198
199 assert(insn->header.execution_size >= 0 &&
200 insn->header.execution_size < Elements(execsize_for_reg));
201 execsize = execsize_for_reg[insn->header.execution_size];
202
203 /* Restrictions from 3.3.10: Register Region Restrictions. */
204 /* 3. */
205 assert(execsize >= width);
206
207 /* FIXME: the assembler has a lot of code written that triggers the
208 * assertions commented it below. Let's paper over it (for now!) until we
209 * can re-validate the shaders with those little inconsistencies fixed. */
210
211 /* 4. */
212 #if 0
213 if (execsize == width && hstride != 0) {
214 assert(vstride == -1 || vstride == width * hstride);
215 }
216 #endif
217
218 /* 5. */
219 if (execsize == width && hstride == 0) {
220 /* no restriction on vstride. */
221 }
222
223 /* 6. */
224 #if 0
225 if (width == 1) {
226 assert(hstride == 0);
227 }
228 #endif
229
230 /* 7. */
231 #if 0
232 if (execsize == 1 && width == 1) {
233 assert(hstride == 0);
234 assert(vstride == 0);
235 }
236 #endif
237
238 /* 8. */
239 if (vstride == 0 && hstride == 0) {
240 assert(width == 1);
241 }
242
243 /* 10. Check destination issues. */
244 }
245
246 void
brw_set_src0(struct brw_compile * p,struct brw_instruction * insn,struct brw_reg reg)247 brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
248 struct brw_reg reg)
249 {
250 struct brw_context *brw = p->brw;
251 struct intel_context *intel = &brw->intel;
252
253 if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
254 assert(reg.nr < 128);
255
256 gen7_convert_mrf_to_grf(p, ®);
257
258 if (intel->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
259 insn->header.opcode == BRW_OPCODE_SENDC)) {
260 /* Any source modifiers or regions will be ignored, since this just
261 * identifies the MRF/GRF to start reading the message contents from.
262 * Check for some likely failures.
263 */
264 assert(!reg.negate);
265 assert(!reg.abs);
266 assert(reg.address_mode == BRW_ADDRESS_DIRECT);
267 }
268
269 validate_reg(insn, reg);
270
271 insn->bits1.da1.src0_reg_file = reg.file;
272 insn->bits1.da1.src0_reg_type = reg.type;
273 insn->bits2.da1.src0_abs = reg.abs;
274 insn->bits2.da1.src0_negate = reg.negate;
275 insn->bits2.da1.src0_address_mode = reg.address_mode;
276
277 if (reg.file == BRW_IMMEDIATE_VALUE) {
278 insn->bits3.ud = reg.dw1.ud;
279
280 /* Required to set some fields in src1 as well:
281 */
282
283 /* FIXME: This looks quite wrong, tempering with src1. I did not find
284 * anything in the bspec that was hinting it woud be needed when setting
285 * src0. before removing this one needs to run piglit.
286
287 insn->bits1.da1.src1_reg_file = 0;
288 insn->bits1.da1.src1_reg_type = reg.type;
289 */
290 }
291 else
292 {
293 if (reg.address_mode == BRW_ADDRESS_DIRECT) {
294 if (insn->header.access_mode == BRW_ALIGN_1) {
295 insn->bits2.da1.src0_subreg_nr = reg.subnr;
296 insn->bits2.da1.src0_reg_nr = reg.nr;
297 }
298 else {
299 insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
300 insn->bits2.da16.src0_reg_nr = reg.nr;
301 }
302 }
303 else {
304 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
305
306 if (insn->header.access_mode == BRW_ALIGN_1) {
307 insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
308 }
309 else {
310 insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
311 }
312 }
313
314 if (insn->header.access_mode == BRW_ALIGN_1) {
315
316 /* FIXME: While this is correct, if the assembler uses that code path
317 * the opcode generated are different and thus needs a validation
318 * pass.
319 if (reg.width == BRW_WIDTH_1 &&
320 insn->header.execution_size == BRW_EXECUTE_1) {
321 insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
322 insn->bits2.da1.src0_width = BRW_WIDTH_1;
323 insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
324 }
325 else {
326 */
327 insn->bits2.da1.src0_horiz_stride = reg.hstride;
328 insn->bits2.da1.src0_width = reg.width;
329 insn->bits2.da1.src0_vert_stride = reg.vstride;
330 /* } */
331 }
332 else {
333 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
334 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
335 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
336 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
337
338 /* This is an oddity of the fact we're using the same
339 * descriptions for registers in align_16 as align_1:
340 */
341 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
342 insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
343 else
344 insn->bits2.da16.src0_vert_stride = reg.vstride;
345 }
346 }
347 }
348
349
brw_set_src1(struct brw_compile * p,struct brw_instruction * insn,struct brw_reg reg)350 void brw_set_src1(struct brw_compile *p,
351 struct brw_instruction *insn,
352 struct brw_reg reg)
353 {
354 struct brw_context *brw = p->brw;
355 struct intel_context *intel = &brw->intel;
356
357 assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
358
359 if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
360 assert(reg.nr < 128);
361
362 gen7_convert_mrf_to_grf(p, ®);
363
364 validate_reg(insn, reg);
365
366 insn->bits1.da1.src1_reg_file = reg.file;
367 insn->bits1.da1.src1_reg_type = reg.type;
368 insn->bits3.da1.src1_abs = reg.abs;
369 insn->bits3.da1.src1_negate = reg.negate;
370 insn->bits3.da1.src1_address_mode = reg.address_mode;
371
372 /* Only src1 can be immediate in two-argument instructions.
373 */
374 assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
375
376 if (reg.file == BRW_IMMEDIATE_VALUE) {
377 insn->bits3.ud = reg.dw1.ud;
378 }
379 else {
380 /* It's only BRW that does not support register-indirect addressing on
381 * src1 */
382 assert (intel->gen >= 4 || reg.address_mode == BRW_ADDRESS_DIRECT);
383
384 if (reg.address_mode == BRW_ADDRESS_DIRECT) {
385 if (insn->header.access_mode == BRW_ALIGN_1) {
386 insn->bits3.da1.src1_subreg_nr = reg.subnr;
387 insn->bits3.da1.src1_reg_nr = reg.nr;
388 }
389 else {
390 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
391 insn->bits3.da16.src1_reg_nr = reg.nr;
392 }
393 }
394 else {
395 insn->bits3.ia1.src1_subreg_nr = reg.subnr;
396
397 if (insn->header.access_mode == BRW_ALIGN_1)
398 insn->bits3.ia1.src1_indirect_offset = reg.dw1.bits.indirect_offset;
399 else
400 insn->bits3.ia16.src1_indirect_offset = reg.dw1.bits.indirect_offset / 16;
401 }
402
403 if (insn->header.access_mode == BRW_ALIGN_1) {
404 /* FIXME: While this is correct, if the assembler uses that code path
405 * the opcode generated are different and thus needs a validation
406 * pass.
407 if (reg.width == BRW_WIDTH_1 &&
408 insn->header.execution_size == BRW_EXECUTE_1) {
409 insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
410 insn->bits3.da1.src1_width = BRW_WIDTH_1;
411 insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
412 }
413 else { */
414 insn->bits3.da1.src1_horiz_stride = reg.hstride;
415 insn->bits3.da1.src1_width = reg.width;
416 insn->bits3.da1.src1_vert_stride = reg.vstride;
417 /* } */
418 }
419 else {
420 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
421 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
422 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
423 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
424
425 /* This is an oddity of the fact we're using the same
426 * descriptions for registers in align_16 as align_1:
427 */
428 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
429 insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
430 else
431 insn->bits3.da16.src1_vert_stride = reg.vstride;
432 }
433 }
434 }
435
436 /**
437 * Set the Message Descriptor and Extended Message Descriptor fields
438 * for SEND messages.
439 *
440 * \note This zeroes out the Function Control bits, so it must be called
441 * \b before filling out any message-specific data. Callers can
442 * choose not to fill in irrelevant bits; they will be zero.
443 */
444 static void
brw_set_message_descriptor(struct brw_compile * p,struct brw_instruction * inst,enum brw_message_target sfid,unsigned msg_length,unsigned response_length,bool header_present,bool end_of_thread)445 brw_set_message_descriptor(struct brw_compile *p,
446 struct brw_instruction *inst,
447 enum brw_message_target sfid,
448 unsigned msg_length,
449 unsigned response_length,
450 bool header_present,
451 bool end_of_thread)
452 {
453 struct intel_context *intel = &p->brw->intel;
454
455 brw_set_src1(p, inst, brw_imm_d(0));
456
457 if (intel->gen >= 5) {
458 inst->bits3.generic_gen5.header_present = header_present;
459 inst->bits3.generic_gen5.response_length = response_length;
460 inst->bits3.generic_gen5.msg_length = msg_length;
461 inst->bits3.generic_gen5.end_of_thread = end_of_thread;
462
463 if (intel->gen >= 6) {
464 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
465 inst->header.destreg__conditionalmod = sfid;
466 } else {
467 /* Set Extended Message Descriptor (ex_desc) */
468 inst->bits2.send_gen5.sfid = sfid;
469 inst->bits2.send_gen5.end_of_thread = end_of_thread;
470 }
471 } else {
472 inst->bits3.generic.response_length = response_length;
473 inst->bits3.generic.msg_length = msg_length;
474 inst->bits3.generic.msg_target = sfid;
475 inst->bits3.generic.end_of_thread = end_of_thread;
476 }
477 }
478
brw_set_math_message(struct brw_compile * p,struct brw_instruction * insn,unsigned function,unsigned integer_type,bool low_precision,unsigned dataType)479 static void brw_set_math_message( struct brw_compile *p,
480 struct brw_instruction *insn,
481 unsigned function,
482 unsigned integer_type,
483 bool low_precision,
484 unsigned dataType )
485 {
486 struct brw_context *brw = p->brw;
487 struct intel_context *intel = &brw->intel;
488 unsigned msg_length;
489 unsigned response_length;
490
491 /* Infer message length from the function */
492 switch (function) {
493 case BRW_MATH_FUNCTION_POW:
494 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
495 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
496 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
497 msg_length = 2;
498 break;
499 default:
500 msg_length = 1;
501 break;
502 }
503
504 /* Infer response length from the function */
505 switch (function) {
506 case BRW_MATH_FUNCTION_SINCOS:
507 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
508 response_length = 2;
509 break;
510 default:
511 response_length = 1;
512 break;
513 }
514
515
516 brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
517 msg_length, response_length, false, false);
518 if (intel->gen == 5) {
519 insn->bits3.math_gen5.function = function;
520 insn->bits3.math_gen5.int_type = integer_type;
521 insn->bits3.math_gen5.precision = low_precision;
522 insn->bits3.math_gen5.saturate = insn->header.saturate;
523 insn->bits3.math_gen5.data_type = dataType;
524 insn->bits3.math_gen5.snapshot = 0;
525 } else {
526 insn->bits3.math.function = function;
527 insn->bits3.math.int_type = integer_type;
528 insn->bits3.math.precision = low_precision;
529 insn->bits3.math.saturate = insn->header.saturate;
530 insn->bits3.math.data_type = dataType;
531 }
532 insn->header.saturate = 0;
533 }
534
535
brw_set_ff_sync_message(struct brw_compile * p,struct brw_instruction * insn,bool allocate,unsigned response_length,bool end_of_thread)536 static void brw_set_ff_sync_message(struct brw_compile *p,
537 struct brw_instruction *insn,
538 bool allocate,
539 unsigned response_length,
540 bool end_of_thread)
541 {
542 brw_set_message_descriptor(p, insn, BRW_SFID_URB,
543 1, response_length, true, end_of_thread);
544 insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
545 insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
546 insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
547 insn->bits3.urb_gen5.allocate = allocate;
548 insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
549 insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
550 }
551
brw_set_urb_message(struct brw_compile * p,struct brw_instruction * insn,bool allocate,bool used,unsigned msg_length,unsigned response_length,bool end_of_thread,bool complete,unsigned offset,unsigned swizzle_control)552 static void brw_set_urb_message( struct brw_compile *p,
553 struct brw_instruction *insn,
554 bool allocate,
555 bool used,
556 unsigned msg_length,
557 unsigned response_length,
558 bool end_of_thread,
559 bool complete,
560 unsigned offset,
561 unsigned swizzle_control )
562 {
563 struct brw_context *brw = p->brw;
564 struct intel_context *intel = &brw->intel;
565
566 brw_set_message_descriptor(p, insn, BRW_SFID_URB,
567 msg_length, response_length, true, end_of_thread);
568 if (intel->gen == 7) {
569 insn->bits3.urb_gen7.opcode = 0; /* URB_WRITE_HWORD */
570 insn->bits3.urb_gen7.offset = offset;
571 assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
572 insn->bits3.urb_gen7.swizzle_control = swizzle_control;
573 /* per_slot_offset = 0 makes it ignore offsets in message header */
574 insn->bits3.urb_gen7.per_slot_offset = 0;
575 insn->bits3.urb_gen7.complete = complete;
576 } else if (intel->gen >= 5) {
577 insn->bits3.urb_gen5.opcode = 0; /* URB_WRITE */
578 insn->bits3.urb_gen5.offset = offset;
579 insn->bits3.urb_gen5.swizzle_control = swizzle_control;
580 insn->bits3.urb_gen5.allocate = allocate;
581 insn->bits3.urb_gen5.used = used; /* ? */
582 insn->bits3.urb_gen5.complete = complete;
583 } else {
584 insn->bits3.urb.opcode = 0; /* ? */
585 insn->bits3.urb.offset = offset;
586 insn->bits3.urb.swizzle_control = swizzle_control;
587 insn->bits3.urb.allocate = allocate;
588 insn->bits3.urb.used = used; /* ? */
589 insn->bits3.urb.complete = complete;
590 }
591 }
592
593 void
brw_set_dp_write_message(struct brw_compile * p,struct brw_instruction * insn,unsigned binding_table_index,unsigned msg_control,unsigned msg_type,unsigned msg_length,bool header_present,unsigned last_render_target,unsigned response_length,unsigned end_of_thread,unsigned send_commit_msg)594 brw_set_dp_write_message(struct brw_compile *p,
595 struct brw_instruction *insn,
596 unsigned binding_table_index,
597 unsigned msg_control,
598 unsigned msg_type,
599 unsigned msg_length,
600 bool header_present,
601 unsigned last_render_target,
602 unsigned response_length,
603 unsigned end_of_thread,
604 unsigned send_commit_msg)
605 {
606 struct brw_context *brw = p->brw;
607 struct intel_context *intel = &brw->intel;
608 unsigned sfid;
609
610 if (intel->gen >= 7) {
611 /* Use the Render Cache for RT writes; otherwise use the Data Cache */
612 if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
613 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
614 else
615 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
616 } else if (intel->gen == 6) {
617 /* Use the render cache for all write messages. */
618 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
619 } else {
620 sfid = BRW_SFID_DATAPORT_WRITE;
621 }
622
623 brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
624 header_present, end_of_thread);
625
626 if (intel->gen >= 7) {
627 insn->bits3.gen7_dp.binding_table_index = binding_table_index;
628 insn->bits3.gen7_dp.msg_control = msg_control |
629 last_render_target << 6;
630 insn->bits3.gen7_dp.msg_type = msg_type;
631 } else if (intel->gen == 6) {
632 insn->bits3.gen6_dp.binding_table_index = binding_table_index;
633 insn->bits3.gen6_dp.msg_control = msg_control |
634 last_render_target << 5;
635 insn->bits3.gen6_dp.msg_type = msg_type;
636 insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
637 } else if (intel->gen == 5) {
638 insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
639 insn->bits3.dp_write_gen5.msg_control = msg_control;
640 insn->bits3.dp_write_gen5.last_render_target = last_render_target;
641 insn->bits3.dp_write_gen5.msg_type = msg_type;
642 insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
643 } else {
644 insn->bits3.dp_write.binding_table_index = binding_table_index;
645 insn->bits3.dp_write.msg_control = msg_control;
646 insn->bits3.dp_write.last_render_target = last_render_target;
647 insn->bits3.dp_write.msg_type = msg_type;
648 insn->bits3.dp_write.send_commit_msg = send_commit_msg;
649 }
650 }
651
652 void
brw_set_dp_read_message(struct brw_compile * p,struct brw_instruction * insn,unsigned binding_table_index,unsigned msg_control,unsigned msg_type,unsigned target_cache,unsigned msg_length,bool header_present,unsigned response_length)653 brw_set_dp_read_message(struct brw_compile *p,
654 struct brw_instruction *insn,
655 unsigned binding_table_index,
656 unsigned msg_control,
657 unsigned msg_type,
658 unsigned target_cache,
659 unsigned msg_length,
660 bool header_present,
661 unsigned response_length)
662 {
663 struct brw_context *brw = p->brw;
664 struct intel_context *intel = &brw->intel;
665 unsigned sfid;
666
667 if (intel->gen >= 7) {
668 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
669 } else if (intel->gen == 6) {
670 if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
671 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
672 else
673 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
674 } else {
675 sfid = BRW_SFID_DATAPORT_READ;
676 }
677
678 brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
679 header_present, false);
680
681 if (intel->gen >= 7) {
682 insn->bits3.gen7_dp.binding_table_index = binding_table_index;
683 insn->bits3.gen7_dp.msg_control = msg_control;
684 insn->bits3.gen7_dp.msg_type = msg_type;
685 } else if (intel->gen == 6) {
686 insn->bits3.gen6_dp.binding_table_index = binding_table_index;
687 insn->bits3.gen6_dp.msg_control = msg_control;
688 insn->bits3.gen6_dp.msg_type = msg_type;
689 insn->bits3.gen6_dp.send_commit_msg = 0;
690 } else if (intel->gen == 5) {
691 insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
692 insn->bits3.dp_read_gen5.msg_control = msg_control;
693 insn->bits3.dp_read_gen5.msg_type = msg_type;
694 insn->bits3.dp_read_gen5.target_cache = target_cache;
695 } else if (intel->is_g4x) {
696 insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
697 insn->bits3.dp_read_g4x.msg_control = msg_control; /*8:10*/
698 insn->bits3.dp_read_g4x.msg_type = msg_type; /*11:13*/
699 insn->bits3.dp_read_g4x.target_cache = target_cache; /*14:15*/
700 } else {
701 insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
702 insn->bits3.dp_read.msg_control = msg_control; /*8:11*/
703 insn->bits3.dp_read.msg_type = msg_type; /*12:13*/
704 insn->bits3.dp_read.target_cache = target_cache; /*14:15*/
705 }
706 }
707
708 void
brw_set_sampler_message(struct brw_compile * p,struct brw_instruction * insn,unsigned binding_table_index,unsigned sampler,unsigned msg_type,unsigned response_length,unsigned msg_length,unsigned header_present,unsigned simd_mode,unsigned return_format)709 brw_set_sampler_message(struct brw_compile *p,
710 struct brw_instruction *insn,
711 unsigned binding_table_index,
712 unsigned sampler,
713 unsigned msg_type,
714 unsigned response_length,
715 unsigned msg_length,
716 unsigned header_present,
717 unsigned simd_mode,
718 unsigned return_format)
719 {
720 struct brw_context *brw = p->brw;
721 struct intel_context *intel = &brw->intel;
722
723 brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
724 response_length, header_present, false);
725
726 if (intel->gen >= 7) {
727 insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
728 insn->bits3.sampler_gen7.sampler = sampler;
729 insn->bits3.sampler_gen7.msg_type = msg_type;
730 insn->bits3.sampler_gen7.simd_mode = simd_mode;
731 } else if (intel->gen >= 5) {
732 insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
733 insn->bits3.sampler_gen5.sampler = sampler;
734 insn->bits3.sampler_gen5.msg_type = msg_type;
735 insn->bits3.sampler_gen5.simd_mode = simd_mode;
736 } else if (intel->is_g4x) {
737 insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
738 insn->bits3.sampler_g4x.sampler = sampler;
739 insn->bits3.sampler_g4x.msg_type = msg_type;
740 } else {
741 insn->bits3.sampler.binding_table_index = binding_table_index;
742 insn->bits3.sampler.sampler = sampler;
743 insn->bits3.sampler.msg_type = msg_type;
744 insn->bits3.sampler.return_format = return_format;
745 }
746 }
747
748
749 #define next_insn brw_next_insn
750 struct brw_instruction *
brw_next_insn(struct brw_compile * p,unsigned opcode)751 brw_next_insn(struct brw_compile *p, unsigned opcode)
752 {
753 struct brw_instruction *insn;
754
755 if (p->nr_insn + 1 > p->store_size) {
756 if (0)
757 printf("incresing the store size to %d\n", p->store_size << 1);
758 p->store_size <<= 1;
759 p->store = reralloc(p->mem_ctx, p->store,
760 struct brw_instruction, p->store_size);
761 if (!p->store)
762 assert(!"realloc eu store memeory failed");
763 }
764
765 p->next_insn_offset += 16;
766 insn = &p->store[p->nr_insn++];
767 memcpy(insn, p->current, sizeof(*insn));
768
769 /* Reset this one-shot flag:
770 */
771
772 if (p->current->header.destreg__conditionalmod) {
773 p->current->header.destreg__conditionalmod = 0;
774 p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
775 }
776
777 insn->header.opcode = opcode;
778 return insn;
779 }
780
/**
 * Emit a one-source instruction: allocate it, then encode the destination
 * followed by the source.  Returns the new instruction.
 */
static struct brw_instruction *brw_alu1(struct brw_compile *p,
                                        unsigned opcode,
                                        struct brw_reg dest,
                                        struct brw_reg src)
{
   struct brw_instruction *inst = next_insn(p, opcode);

   /* The destination goes first: it fixes the execution size that the
    * source validation reads.
    */
   brw_set_dest(p, inst, dest);
   brw_set_src0(p, inst, src);

   return inst;
}
791
/**
 * Emit a two-source instruction: allocate it, then encode the
 * destination, src0 and src1 in that order.  Returns the new instruction.
 */
static struct brw_instruction *brw_alu2(struct brw_compile *p,
                                        unsigned opcode,
                                        struct brw_reg dest,
                                        struct brw_reg src0,
                                        struct brw_reg src1)
{
   struct brw_instruction *inst = next_insn(p, opcode);

   /* Order matters: the sources are validated against the execution size
    * set with the destination, and src1 checks the src0 register file.
    */
   brw_set_dest(p, inst, dest);
   brw_set_src0(p, inst, src0);
   brw_set_src1(p, inst, src1);

   return inst;
}
804
805 static int
get_3src_subreg_nr(struct brw_reg reg)806 get_3src_subreg_nr(struct brw_reg reg)
807 {
808 if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
809 assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
810 return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
811 } else {
812 return reg.subnr / 4;
813 }
814 }
815
get_3src_type(int type)816 static int get_3src_type(int type)
817 {
818 assert(type == BRW_REGISTER_TYPE_F ||
819 type == BRW_REGISTER_TYPE_D ||
820 type == BRW_REGISTER_TYPE_UD);
821
822 switch(type) {
823 case BRW_REGISTER_TYPE_F: return BRW_REGISTER_3SRC_TYPE_F;
824 case BRW_REGISTER_TYPE_D: return BRW_REGISTER_3SRC_TYPE_D;
825 case BRW_REGISTER_TYPE_UD: return BRW_REGISTER_3SRC_TYPE_UD;
826 }
827
828 return BRW_REGISTER_3SRC_TYPE_F;
829 }
830
831 void
brw_set_3src_dest(struct brw_compile * p,struct brw_instruction * insn,struct brw_reg dest)832 brw_set_3src_dest(struct brw_compile *p,
833 struct brw_instruction *insn,
834 struct brw_reg dest)
835 {
836 gen7_convert_mrf_to_grf(p, &dest);
837
838 assert(insn->header.access_mode == BRW_ALIGN_16);
839
840 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
841 dest.file == BRW_MESSAGE_REGISTER_FILE);
842 assert(dest.nr < 128);
843 assert(dest.address_mode == BRW_ADDRESS_DIRECT);
844 insn->bits1.da3src.dest_reg_type = get_3src_type(dest.type);
845 insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
846 insn->bits1.da3src.dest_reg_nr = dest.nr;
847 insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
848 insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
849 guess_execution_size(p, insn, dest);
850 }
851
852 void
brw_set_3src_src0(struct brw_compile * p,struct brw_instruction * insn,struct brw_reg src0)853 brw_set_3src_src0(struct brw_compile *p,
854 struct brw_instruction *insn,
855 struct brw_reg src0)
856 {
857 assert(src0.file == BRW_GENERAL_REGISTER_FILE);
858 assert(src0.address_mode == BRW_ADDRESS_DIRECT);
859 assert(src0.nr < 128);
860 insn->bits1.da3src.src_reg_type = get_3src_type(src0.type);
861 insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
862 insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
863 insn->bits2.da3src.src0_reg_nr = src0.nr;
864 insn->bits1.da3src.src0_abs = src0.abs;
865 insn->bits1.da3src.src0_negate = src0.negate;
866 insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;
867 }
868
869 void
brw_set_3src_src1(struct brw_compile * p,struct brw_instruction * insn,struct brw_reg src1)870 brw_set_3src_src1(struct brw_compile *p,
871 struct brw_instruction *insn,
872 struct brw_reg src1)
873 {
874 assert(src1.file == BRW_GENERAL_REGISTER_FILE);
875 assert(src1.address_mode == BRW_ADDRESS_DIRECT);
876 assert(src1.nr < 128);
877 assert(get_3src_type(src1.type) == insn->bits1.da3src.src_reg_type);
878 insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
879 insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
880 insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
881 insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
882 insn->bits3.da3src.src1_reg_nr = src1.nr;
883 insn->bits1.da3src.src1_abs = src1.abs;
884 insn->bits1.da3src.src1_negate = src1.negate;
885 }
886
887 void
brw_set_3src_src2(struct brw_compile * p,struct brw_instruction * insn,struct brw_reg src2)888 brw_set_3src_src2(struct brw_compile *p,
889 struct brw_instruction *insn,
890 struct brw_reg src2)
891 {
892 assert(src2.file == BRW_GENERAL_REGISTER_FILE);
893 assert(src2.address_mode == BRW_ADDRESS_DIRECT);
894 assert(src2.nr < 128);
895 assert(get_3src_type(src2.type) == insn->bits1.da3src.src_reg_type);
896 insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
897 insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
898 insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
899 insn->bits3.da3src.src2_reg_nr = src2.nr;
900 insn->bits1.da3src.src2_abs = src2.abs;
901 insn->bits1.da3src.src2_negate = src2.negate;
902 }
903
/**
 * Emit a three-source ALU instruction.
 *
 * Allocates the next instruction with the given opcode and encodes the
 * destination followed by the three sources, in that order.
 *
 * \return the newly emitted instruction
 */
static struct brw_instruction *brw_alu3(struct brw_compile *p,
                                        unsigned opcode,
                                        struct brw_reg dest,
                                        struct brw_reg src0,
                                        struct brw_reg src1,
                                        struct brw_reg src2)
{
   struct brw_instruction *inst = next_insn(p, opcode);

   brw_set_3src_dest(p, inst, dest);
   brw_set_3src_src0(p, inst, src0);
   brw_set_3src_src1(p, inst, src1);
   brw_set_3src_src2(p, inst, src2);

   return inst;
}
918
919
920 /***********************************************************************
921 * Convenience routines.
922 */
/* Define brw_<OP>(p, dest, src0): a one-source instruction emitted through
 * brw_alu1() with opcode BRW_OPCODE_<OP>.
 */
#define ALU1(OP) \
struct brw_instruction * \
brw_##OP(struct brw_compile *p, struct brw_reg dest, struct brw_reg src0) \
{ \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0); \
}
930
/* Define brw_<OP>(p, dest, src0, src1): a two-source instruction emitted
 * through brw_alu2() with opcode BRW_OPCODE_<OP>.
 */
#define ALU2(OP) \
struct brw_instruction * \
brw_##OP(struct brw_compile *p, struct brw_reg dest, \
         struct brw_reg src0, struct brw_reg src1) \
{ \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1); \
}
939
/* Define brw_<OP>(p, dest, src0, src1, src2): a three-source instruction
 * emitted through brw_alu3() with opcode BRW_OPCODE_<OP>.
 */
#define ALU3(OP) \
struct brw_instruction * \
brw_##OP(struct brw_compile *p, struct brw_reg dest, \
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2) \
{ \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
949
/* Define brw_<OP>(p, dest, src) for a rounding opcode.
 *
 * Rounding operations (other than RNDD) need two instructions: the first
 * writes a rounded value (possibly rounded the wrong way) to the dest
 * register and also sets a per-channel "increment bit" in the flag register.
 * A predicated ADD of 1.0 then corrects dest to hold the desired result.
 *
 * Sandybridge and later appear to round correctly without the ADD, so the
 * fix-up is only emitted when gen < 6.
 */
#define ROUND(OP) \
void brw_##OP(struct brw_compile *p, \
              struct brw_reg dest, \
              struct brw_reg src) \
{ \
   struct brw_instruction *round_inst = next_insn(p, BRW_OPCODE_##OP); \
 \
   brw_set_dest(p, round_inst, dest); \
   brw_set_src0(p, round_inst, src); \
 \
   if (p->brw->intel.gen < 6) { \
      struct brw_instruction *fixup; \
 \
      /* turn on round-increments (must be done before the ADD is emitted) */ \
      round_inst->header.destreg__conditionalmod = BRW_CONDITIONAL_R; \
      fixup = brw_ADD(p, dest, dest, brw_imm_f(1.0f)); \
      fixup->header.predicate_control = BRW_PREDICATE_NORMAL; \
   } \
}
974
975
976 ALU1(MOV)
ALU2(SEL)977 ALU2(SEL)
978 ALU1(NOT)
979 ALU2(AND)
980 ALU2(OR)
981 ALU2(XOR)
982 ALU2(SHR)
983 ALU2(SHL)
984 ALU2(RSR)
985 ALU2(RSL)
986 ALU2(ASR)
987 ALU1(FRC)
988 ALU1(RNDD)
989 ALU2(MAC)
990 ALU2(MACH)
991 ALU1(LZD)
992 ALU2(DP4)
993 ALU2(DPH)
994 ALU2(DP3)
995 ALU2(DP2)
996 ALU2(LINE)
997 ALU2(PLN)
998 ALU3(MAD)
999
1000 ROUND(RNDZ)
1001 ROUND(RNDE)
1002
1003
1004 struct brw_instruction *brw_ADD(struct brw_compile *p,
1005 struct brw_reg dest,
1006 struct brw_reg src0,
1007 struct brw_reg src1)
1008 {
1009 /* 6.2.2: add */
1010 if (src0.type == BRW_REGISTER_TYPE_F ||
1011 (src0.file == BRW_IMMEDIATE_VALUE &&
1012 src0.type == BRW_REGISTER_TYPE_VF)) {
1013 assert(src1.type != BRW_REGISTER_TYPE_UD);
1014 assert(src1.type != BRW_REGISTER_TYPE_D);
1015 }
1016
1017 if (src1.type == BRW_REGISTER_TYPE_F ||
1018 (src1.file == BRW_IMMEDIATE_VALUE &&
1019 src1.type == BRW_REGISTER_TYPE_VF)) {
1020 assert(src0.type != BRW_REGISTER_TYPE_UD);
1021 assert(src0.type != BRW_REGISTER_TYPE_D);
1022 }
1023
1024 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1025 }
1026
brw_AVG(struct brw_compile * p,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)1027 struct brw_instruction *brw_AVG(struct brw_compile *p,
1028 struct brw_reg dest,
1029 struct brw_reg src0,
1030 struct brw_reg src1)
1031 {
1032 assert(dest.type == src0.type);
1033 assert(src0.type == src1.type);
1034 switch (src0.type) {
1035 case BRW_REGISTER_TYPE_B:
1036 case BRW_REGISTER_TYPE_UB:
1037 case BRW_REGISTER_TYPE_W:
1038 case BRW_REGISTER_TYPE_UW:
1039 case BRW_REGISTER_TYPE_D:
1040 case BRW_REGISTER_TYPE_UD:
1041 break;
1042 default:
1043 assert(!"Bad type for brw_AVG");
1044 }
1045
1046 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1047 }
1048
brw_MUL(struct brw_compile * p,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)1049 struct brw_instruction *brw_MUL(struct brw_compile *p,
1050 struct brw_reg dest,
1051 struct brw_reg src0,
1052 struct brw_reg src1)
1053 {
1054 /* 6.32.38: mul */
1055 if (src0.type == BRW_REGISTER_TYPE_D ||
1056 src0.type == BRW_REGISTER_TYPE_UD ||
1057 src1.type == BRW_REGISTER_TYPE_D ||
1058 src1.type == BRW_REGISTER_TYPE_UD) {
1059 assert(dest.type != BRW_REGISTER_TYPE_F);
1060 }
1061
1062 if (src0.type == BRW_REGISTER_TYPE_F ||
1063 (src0.file == BRW_IMMEDIATE_VALUE &&
1064 src0.type == BRW_REGISTER_TYPE_VF)) {
1065 assert(src1.type != BRW_REGISTER_TYPE_UD);
1066 assert(src1.type != BRW_REGISTER_TYPE_D);
1067 }
1068
1069 if (src1.type == BRW_REGISTER_TYPE_F ||
1070 (src1.file == BRW_IMMEDIATE_VALUE &&
1071 src1.type == BRW_REGISTER_TYPE_VF)) {
1072 assert(src0.type != BRW_REGISTER_TYPE_UD);
1073 assert(src0.type != BRW_REGISTER_TYPE_D);
1074 }
1075
1076 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1077 src0.nr != BRW_ARF_ACCUMULATOR);
1078 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1079 src1.nr != BRW_ARF_ACCUMULATOR);
1080
1081 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1082 }
1083
1084
brw_NOP(struct brw_compile * p)1085 void brw_NOP(struct brw_compile *p)
1086 {
1087 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
1088 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1089 brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1090 brw_set_src1(p, insn, brw_imm_ud(0x0));
1091 }
1092
1093
1094
1095
1096
1097 /***********************************************************************
1098 * Comparisons, if/else/endif
1099 */
1100
brw_JMPI(struct brw_compile * p,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)1101 struct brw_instruction *brw_JMPI(struct brw_compile *p,
1102 struct brw_reg dest,
1103 struct brw_reg src0,
1104 struct brw_reg src1)
1105 {
1106 struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
1107
1108 insn->header.execution_size = 1;
1109 insn->header.compression_control = BRW_COMPRESSION_NONE;
1110 insn->header.mask_control = BRW_MASK_DISABLE;
1111
1112 p->current->header.predicate_control = BRW_PREDICATE_NONE;
1113
1114 return insn;
1115 }
1116
1117 static void
push_if_stack(struct brw_compile * p,struct brw_instruction * inst)1118 push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
1119 {
1120 p->if_stack[p->if_stack_depth] = inst - p->store;
1121
1122 p->if_stack_depth++;
1123 if (p->if_stack_array_size <= p->if_stack_depth) {
1124 p->if_stack_array_size *= 2;
1125 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1126 p->if_stack_array_size);
1127 }
1128 }
1129
1130 static struct brw_instruction *
pop_if_stack(struct brw_compile * p)1131 pop_if_stack(struct brw_compile *p)
1132 {
1133 p->if_stack_depth--;
1134 return &p->store[p->if_stack[p->if_stack_depth]];
1135 }
1136
1137 static void
push_loop_stack(struct brw_compile * p,struct brw_instruction * inst)1138 push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
1139 {
1140 if (p->loop_stack_array_size < p->loop_stack_depth) {
1141 p->loop_stack_array_size *= 2;
1142 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1143 p->loop_stack_array_size);
1144 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1145 p->loop_stack_array_size);
1146 }
1147
1148 p->loop_stack[p->loop_stack_depth] = inst - p->store;
1149 p->loop_stack_depth++;
1150 p->if_depth_in_loop[p->loop_stack_depth] = 0;
1151 }
1152
1153 static struct brw_instruction *
get_inner_do_insn(struct brw_compile * p)1154 get_inner_do_insn(struct brw_compile *p)
1155 {
1156 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1157 }
1158
1159 /* EU takes the value from the flag register and pushes it onto some
1160 * sort of a stack (presumably merging with any flag value already on
1161 * the stack). Within an if block, the flags at the top of the stack
1162 * control execution on each channel of the unit, eg. on each of the
1163 * 16 pixel values in our wm programs.
1164 *
1165 * When the matching 'else' instruction is reached (presumably by
1166 * countdown of the instruction count patched in by our ELSE/ENDIF
1167 * functions), the relevent flags are inverted.
1168 *
1169 * When the matching 'endif' instruction is reached, the flags are
1170 * popped off. If the stack is now empty, normal execution resumes.
1171 */
1172 struct brw_instruction *
brw_IF(struct brw_compile * p,unsigned execute_size)1173 brw_IF(struct brw_compile *p, unsigned execute_size)
1174 {
1175 struct intel_context *intel = &p->brw->intel;
1176 struct brw_instruction *insn;
1177
1178 insn = next_insn(p, BRW_OPCODE_IF);
1179
1180 /* Override the defaults for this instruction:
1181 */
1182 if (intel->gen < 6) {
1183 brw_set_dest(p, insn, brw_ip_reg());
1184 brw_set_src0(p, insn, brw_ip_reg());
1185 brw_set_src1(p, insn, brw_imm_d(0x0));
1186 } else if (intel->gen == 6) {
1187 brw_set_dest(p, insn, brw_imm_w(0));
1188 insn->bits1.branch_gen6.jump_count = 0;
1189 brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1190 brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1191 } else {
1192 brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1193 brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1194 brw_set_src1(p, insn, brw_imm_ud(0));
1195 insn->bits3.break_cont.jip = 0;
1196 insn->bits3.break_cont.uip = 0;
1197 }
1198
1199 insn->header.execution_size = execute_size;
1200 insn->header.compression_control = BRW_COMPRESSION_NONE;
1201 insn->header.predicate_control = BRW_PREDICATE_NORMAL;
1202 insn->header.mask_control = BRW_MASK_ENABLE;
1203 if (!p->single_program_flow)
1204 insn->header.thread_control = BRW_THREAD_SWITCH;
1205
1206 p->current->header.predicate_control = BRW_PREDICATE_NONE;
1207
1208 push_if_stack(p, insn);
1209 p->if_depth_in_loop[p->loop_stack_depth]++;
1210 return insn;
1211 }
1212
1213 /* This function is only used for gen6-style IF instructions with an
1214 * embedded comparison (conditional modifier). It is not used on gen7.
1215 */
1216 struct brw_instruction *
gen6_IF(struct brw_compile * p,uint32_t conditional,struct brw_reg src0,struct brw_reg src1)1217 gen6_IF(struct brw_compile *p, uint32_t conditional,
1218 struct brw_reg src0, struct brw_reg src1)
1219 {
1220 struct brw_instruction *insn;
1221
1222 insn = next_insn(p, BRW_OPCODE_IF);
1223
1224 brw_set_dest(p, insn, brw_imm_w(0));
1225 if (p->compressed) {
1226 insn->header.execution_size = BRW_EXECUTE_16;
1227 } else {
1228 insn->header.execution_size = BRW_EXECUTE_8;
1229 }
1230 insn->bits1.branch_gen6.jump_count = 0;
1231 brw_set_src0(p, insn, src0);
1232 brw_set_src1(p, insn, src1);
1233
1234 assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
1235 assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1236 insn->header.destreg__conditionalmod = conditional;
1237
1238 if (!p->single_program_flow)
1239 insn->header.thread_control = BRW_THREAD_SWITCH;
1240
1241 push_if_stack(p, insn);
1242 return insn;
1243 }
1244
1245 /**
1246 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1247 */
1248 static void
convert_IF_ELSE_to_ADD(struct brw_compile * p,struct brw_instruction * if_inst,struct brw_instruction * else_inst)1249 convert_IF_ELSE_to_ADD(struct brw_compile *p,
1250 struct brw_instruction *if_inst,
1251 struct brw_instruction *else_inst)
1252 {
1253 /* The next instruction (where the ENDIF would be, if it existed) */
1254 struct brw_instruction *next_inst = &p->store[p->nr_insn];
1255
1256 assert(p->single_program_flow);
1257 assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1258 assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1259 assert(if_inst->header.execution_size == BRW_EXECUTE_1);
1260
1261 /* Convert IF to an ADD instruction that moves the instruction pointer
1262 * to the first instruction of the ELSE block. If there is no ELSE
1263 * block, point to where ENDIF would be. Reverse the predicate.
1264 *
1265 * There's no need to execute an ENDIF since we don't need to do any
1266 * stack operations, and if we're currently executing, we just want to
1267 * continue normally.
1268 */
1269 if_inst->header.opcode = BRW_OPCODE_ADD;
1270 if_inst->header.predicate_inverse = 1;
1271
1272 if (else_inst != NULL) {
1273 /* Convert ELSE to an ADD instruction that points where the ENDIF
1274 * would be.
1275 */
1276 else_inst->header.opcode = BRW_OPCODE_ADD;
1277
1278 if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
1279 else_inst->bits3.ud = (next_inst - else_inst) * 16;
1280 } else {
1281 if_inst->bits3.ud = (next_inst - if_inst) * 16;
1282 }
1283 }
1284
1285 /**
1286 * Patch IF and ELSE instructions with appropriate jump targets.
1287 */
1288 static void
patch_IF_ELSE(struct brw_compile * p,struct brw_instruction * if_inst,struct brw_instruction * else_inst,struct brw_instruction * endif_inst)1289 patch_IF_ELSE(struct brw_compile *p,
1290 struct brw_instruction *if_inst,
1291 struct brw_instruction *else_inst,
1292 struct brw_instruction *endif_inst)
1293 {
1294 struct intel_context *intel = &p->brw->intel;
1295
1296 /* We shouldn't be patching IF and ELSE instructions in single program flow
1297 * mode when gen < 6, because in single program flow mode on those
1298 * platforms, we convert flow control instructions to conditional ADDs that
1299 * operate on IP (see brw_ENDIF).
1300 *
1301 * However, on Gen6, writing to IP doesn't work in single program flow mode
1302 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1303 * not be updated by non-flow control instructions."). And on later
1304 * platforms, there is no significant benefit to converting control flow
1305 * instructions to conditional ADDs. So we do patch IF and ELSE
1306 * instructions in single program flow mode on those platforms.
1307 */
1308 if (intel->gen < 6)
1309 assert(!p->single_program_flow);
1310
1311 assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1312 assert(endif_inst != NULL);
1313 assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1314
1315 unsigned br = 1;
1316 /* Jump count is for 64bit data chunk each, so one 128bit instruction
1317 * requires 2 chunks.
1318 */
1319 if (intel->gen >= 5)
1320 br = 2;
1321
1322 assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
1323 endif_inst->header.execution_size = if_inst->header.execution_size;
1324
1325 if (else_inst == NULL) {
1326 /* Patch IF -> ENDIF */
1327 if (intel->gen < 6) {
1328 /* Turn it into an IFF, which means no mask stack operations for
1329 * all-false and jumping past the ENDIF.
1330 */
1331 if_inst->header.opcode = BRW_OPCODE_IFF;
1332 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
1333 if_inst->bits3.if_else.pop_count = 0;
1334 if_inst->bits3.if_else.pad0 = 0;
1335 } else if (intel->gen == 6) {
1336 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1337 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
1338 } else {
1339 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1340 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
1341 }
1342 } else {
1343 else_inst->header.execution_size = if_inst->header.execution_size;
1344
1345 /* Patch IF -> ELSE */
1346 if (intel->gen < 6) {
1347 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
1348 if_inst->bits3.if_else.pop_count = 0;
1349 if_inst->bits3.if_else.pad0 = 0;
1350 } else if (intel->gen == 6) {
1351 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
1352 }
1353
1354 /* Patch ELSE -> ENDIF */
1355 if (intel->gen < 6) {
1356 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1357 * matching ENDIF.
1358 */
1359 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
1360 else_inst->bits3.if_else.pop_count = 1;
1361 else_inst->bits3.if_else.pad0 = 0;
1362 } else if (intel->gen == 6) {
1363 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1364 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
1365 } else {
1366 /* The IF instruction's JIP should point just past the ELSE */
1367 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
1368 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1369 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1370 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
1371 }
1372 }
1373 }
1374
1375 void
brw_ELSE(struct brw_compile * p)1376 brw_ELSE(struct brw_compile *p)
1377 {
1378 struct intel_context *intel = &p->brw->intel;
1379 struct brw_instruction *insn;
1380
1381 insn = next_insn(p, BRW_OPCODE_ELSE);
1382
1383 if (intel->gen < 6) {
1384 brw_set_dest(p, insn, brw_ip_reg());
1385 brw_set_src0(p, insn, brw_ip_reg());
1386 brw_set_src1(p, insn, brw_imm_d(0x0));
1387 } else if (intel->gen == 6) {
1388 brw_set_dest(p, insn, brw_imm_w(0));
1389 insn->bits1.branch_gen6.jump_count = 0;
1390 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1391 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1392 } else {
1393 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1394 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1395 brw_set_src1(p, insn, brw_imm_ud(0));
1396 insn->bits3.break_cont.jip = 0;
1397 insn->bits3.break_cont.uip = 0;
1398 }
1399
1400 insn->header.compression_control = BRW_COMPRESSION_NONE;
1401 insn->header.mask_control = BRW_MASK_ENABLE;
1402 if (!p->single_program_flow)
1403 insn->header.thread_control = BRW_THREAD_SWITCH;
1404
1405 push_if_stack(p, insn);
1406 }
1407
1408 void
brw_ENDIF(struct brw_compile * p)1409 brw_ENDIF(struct brw_compile *p)
1410 {
1411 struct intel_context *intel = &p->brw->intel;
1412 struct brw_instruction *insn = NULL;
1413 struct brw_instruction *else_inst = NULL;
1414 struct brw_instruction *if_inst = NULL;
1415 struct brw_instruction *tmp;
1416 bool emit_endif = true;
1417
1418 /* In single program flow mode, we can express IF and ELSE instructions
1419 * equivalently as ADD instructions that operate on IP. On platforms prior
1420 * to Gen6, flow control instructions cause an implied thread switch, so
1421 * this is a significant savings.
1422 *
1423 * However, on Gen6, writing to IP doesn't work in single program flow mode
1424 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1425 * not be updated by non-flow control instructions."). And on later
1426 * platforms, there is no significant benefit to converting control flow
1427 * instructions to conditional ADDs. So we only do this trick on Gen4 and
1428 * Gen5.
1429 */
1430 if (intel->gen < 6 && p->single_program_flow)
1431 emit_endif = false;
1432
1433 /*
1434 * A single next_insn() may change the base adress of instruction store
1435 * memory(p->store), so call it first before referencing the instruction
1436 * store pointer from an index
1437 */
1438 if (emit_endif)
1439 insn = next_insn(p, BRW_OPCODE_ENDIF);
1440
1441 /* Pop the IF and (optional) ELSE instructions from the stack */
1442 p->if_depth_in_loop[p->loop_stack_depth]--;
1443 tmp = pop_if_stack(p);
1444 if (tmp->header.opcode == BRW_OPCODE_ELSE) {
1445 else_inst = tmp;
1446 tmp = pop_if_stack(p);
1447 }
1448 if_inst = tmp;
1449
1450 if (!emit_endif) {
1451 /* ENDIF is useless; don't bother emitting it. */
1452 convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1453 return;
1454 }
1455
1456 if (intel->gen < 6) {
1457 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1458 brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1459 brw_set_src1(p, insn, brw_imm_d(0x0));
1460 } else if (intel->gen == 6) {
1461 brw_set_dest(p, insn, brw_imm_w(0));
1462 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1463 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1464 } else {
1465 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1466 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1467 brw_set_src1(p, insn, brw_imm_ud(0));
1468 }
1469
1470 insn->header.compression_control = BRW_COMPRESSION_NONE;
1471 insn->header.mask_control = BRW_MASK_ENABLE;
1472 insn->header.thread_control = BRW_THREAD_SWITCH;
1473
1474 /* Also pop item off the stack in the endif instruction: */
1475 if (intel->gen < 6) {
1476 insn->bits3.if_else.jump_count = 0;
1477 insn->bits3.if_else.pop_count = 1;
1478 insn->bits3.if_else.pad0 = 0;
1479 } else if (intel->gen == 6) {
1480 insn->bits1.branch_gen6.jump_count = 2;
1481 } else {
1482 insn->bits3.break_cont.jip = 2;
1483 }
1484 patch_IF_ELSE(p, if_inst, else_inst, insn);
1485 }
1486
brw_BREAK(struct brw_compile * p)1487 struct brw_instruction *brw_BREAK(struct brw_compile *p)
1488 {
1489 struct intel_context *intel = &p->brw->intel;
1490 struct brw_instruction *insn;
1491
1492 insn = next_insn(p, BRW_OPCODE_BREAK);
1493 if (intel->gen >= 6) {
1494 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1495 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1496 brw_set_src1(p, insn, brw_imm_d(0x0));
1497 } else {
1498 brw_set_dest(p, insn, brw_ip_reg());
1499 brw_set_src0(p, insn, brw_ip_reg());
1500 brw_set_src1(p, insn, brw_imm_d(0x0));
1501 insn->bits3.if_else.pad0 = 0;
1502 insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1503 }
1504 insn->header.compression_control = BRW_COMPRESSION_NONE;
1505 insn->header.execution_size = BRW_EXECUTE_8;
1506
1507 return insn;
1508 }
1509
gen6_CONT(struct brw_compile * p)1510 struct brw_instruction *gen6_CONT(struct brw_compile *p)
1511 {
1512 struct brw_instruction *insn;
1513
1514 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1515 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1516 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1517 brw_set_dest(p, insn, brw_ip_reg());
1518 brw_set_src0(p, insn, brw_ip_reg());
1519 brw_set_src1(p, insn, brw_imm_d(0x0));
1520
1521 insn->header.compression_control = BRW_COMPRESSION_NONE;
1522 insn->header.execution_size = BRW_EXECUTE_8;
1523 return insn;
1524 }
1525
brw_CONT(struct brw_compile * p)1526 struct brw_instruction *brw_CONT(struct brw_compile *p)
1527 {
1528 struct brw_instruction *insn;
1529 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1530 brw_set_dest(p, insn, brw_ip_reg());
1531 brw_set_src0(p, insn, brw_ip_reg());
1532 brw_set_src1(p, insn, brw_imm_d(0x0));
1533 insn->header.compression_control = BRW_COMPRESSION_NONE;
1534 insn->header.execution_size = BRW_EXECUTE_8;
1535 /* insn->header.mask_control = BRW_MASK_DISABLE; */
1536 insn->bits3.if_else.pad0 = 0;
1537 insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1538 return insn;
1539 }
1540
gen6_HALT(struct brw_compile * p)1541 struct brw_instruction *gen6_HALT(struct brw_compile *p)
1542 {
1543 struct brw_instruction *insn;
1544
1545 insn = next_insn(p, BRW_OPCODE_HALT);
1546 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1547 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1548 brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1549
1550 if (p->compressed) {
1551 insn->header.execution_size = BRW_EXECUTE_16;
1552 } else {
1553 insn->header.compression_control = BRW_COMPRESSION_NONE;
1554 insn->header.execution_size = BRW_EXECUTE_8;
1555 }
1556 return insn;
1557 }
1558
1559 /* DO/WHILE loop:
1560 *
1561 * The DO/WHILE is just an unterminated loop -- break or continue are
1562 * used for control within the loop. We have a few ways they can be
1563 * done.
1564 *
1565 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1566 * jip and no DO instruction.
1567 *
1568 * For non-uniform control flow pre-gen6, there's a DO instruction to
1569 * push the mask, and a WHILE to jump back, and BREAK to get out and
1570 * pop the mask.
1571 *
1572 * For gen6, there's no more mask stack, so no need for DO. WHILE
1573 * just points back to the first instruction of the loop.
1574 */
brw_DO(struct brw_compile * p,unsigned execute_size)1575 struct brw_instruction *brw_DO(struct brw_compile *p, unsigned execute_size)
1576 {
1577 struct intel_context *intel = &p->brw->intel;
1578
1579 if (intel->gen >= 6 || p->single_program_flow) {
1580 push_loop_stack(p, &p->store[p->nr_insn]);
1581 return &p->store[p->nr_insn];
1582 } else {
1583 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
1584
1585 push_loop_stack(p, insn);
1586
1587 /* Override the defaults for this instruction:
1588 */
1589 brw_set_dest(p, insn, brw_null_reg());
1590 brw_set_src0(p, insn, brw_null_reg());
1591 brw_set_src1(p, insn, brw_null_reg());
1592
1593 insn->header.compression_control = BRW_COMPRESSION_NONE;
1594 insn->header.execution_size = execute_size;
1595 insn->header.predicate_control = BRW_PREDICATE_NONE;
1596 /* insn->header.mask_control = BRW_MASK_ENABLE; */
1597 /* insn->header.mask_control = BRW_MASK_DISABLE; */
1598
1599 return insn;
1600 }
1601 }
1602
1603 /**
1604 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1605 * instruction here.
1606 *
1607 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1608 * nesting, since it can always just point to the end of the block/current loop.
1609 */
1610 static void
brw_patch_break_cont(struct brw_compile * p,struct brw_instruction * while_inst)1611 brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
1612 {
1613 struct intel_context *intel = &p->brw->intel;
1614 struct brw_instruction *do_inst = get_inner_do_insn(p);
1615 struct brw_instruction *inst;
1616 int br = (intel->gen == 5) ? 2 : 1;
1617
1618 for (inst = while_inst - 1; inst != do_inst; inst--) {
1619 /* If the jump count is != 0, that means that this instruction has already
1620 * been patched because it's part of a loop inside of the one we're
1621 * patching.
1622 */
1623 if (inst->header.opcode == BRW_OPCODE_BREAK &&
1624 inst->bits3.if_else.jump_count == 0) {
1625 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
1626 } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
1627 inst->bits3.if_else.jump_count == 0) {
1628 inst->bits3.if_else.jump_count = br * (while_inst - inst);
1629 }
1630 }
1631 }
1632
brw_WHILE(struct brw_compile * p)1633 struct brw_instruction *brw_WHILE(struct brw_compile *p)
1634 {
1635 struct intel_context *intel = &p->brw->intel;
1636 struct brw_instruction *insn, *do_insn;
1637 unsigned br = 1;
1638
1639 if (intel->gen >= 5)
1640 br = 2;
1641
1642 if (intel->gen >= 7) {
1643 insn = next_insn(p, BRW_OPCODE_WHILE);
1644 do_insn = get_inner_do_insn(p);
1645
1646 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1647 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1648 brw_set_src1(p, insn, brw_imm_ud(0));
1649 insn->bits3.break_cont.jip = br * (do_insn - insn);
1650
1651 insn->header.execution_size = BRW_EXECUTE_8;
1652 } else if (intel->gen == 6) {
1653 insn = next_insn(p, BRW_OPCODE_WHILE);
1654 do_insn = get_inner_do_insn(p);
1655
1656 brw_set_dest(p, insn, brw_imm_w(0));
1657 insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
1658 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1659 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1660
1661 insn->header.execution_size = BRW_EXECUTE_8;
1662 } else {
1663 if (p->single_program_flow) {
1664 insn = next_insn(p, BRW_OPCODE_ADD);
1665 do_insn = get_inner_do_insn(p);
1666
1667 brw_set_dest(p, insn, brw_ip_reg());
1668 brw_set_src0(p, insn, brw_ip_reg());
1669 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1670 insn->header.execution_size = BRW_EXECUTE_1;
1671 } else {
1672 insn = next_insn(p, BRW_OPCODE_WHILE);
1673 do_insn = get_inner_do_insn(p);
1674
1675 assert(do_insn->header.opcode == BRW_OPCODE_DO);
1676
1677 brw_set_dest(p, insn, brw_ip_reg());
1678 brw_set_src0(p, insn, brw_ip_reg());
1679 brw_set_src1(p, insn, brw_imm_d(0));
1680
1681 insn->header.execution_size = do_insn->header.execution_size;
1682 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
1683 insn->bits3.if_else.pop_count = 0;
1684 insn->bits3.if_else.pad0 = 0;
1685
1686 brw_patch_break_cont(p, insn);
1687 }
1688 }
1689 insn->header.compression_control = BRW_COMPRESSION_NONE;
1690 p->current->header.predicate_control = BRW_PREDICATE_NONE;
1691
1692 p->loop_stack_depth--;
1693
1694 return insn;
1695 }
1696
1697
1698 /* FORWARD JUMPS:
1699 */
brw_land_fwd_jump(struct brw_compile * p,int jmp_insn_idx)1700 void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
1701 {
1702 struct intel_context *intel = &p->brw->intel;
1703 struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
1704 unsigned jmpi = 1;
1705
1706 if (intel->gen >= 5)
1707 jmpi = 2;
1708
1709 assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1710 assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1711
1712 jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
1713 }
1714
1715
1716
1717 /* To integrate with the above, it makes sense that the comparison
1718 * instruction should populate the flag register. It might be simpler
1719 * just to use the flag reg for most WM tasks?
1720 */
brw_CMP(struct brw_compile * p,struct brw_reg dest,unsigned conditional,struct brw_reg src0,struct brw_reg src1)1721 void brw_CMP(struct brw_compile *p,
1722 struct brw_reg dest,
1723 unsigned conditional,
1724 struct brw_reg src0,
1725 struct brw_reg src1)
1726 {
1727 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1728
1729 insn->header.destreg__conditionalmod = conditional;
1730 brw_set_dest(p, insn, dest);
1731 brw_set_src0(p, insn, src0);
1732 brw_set_src1(p, insn, src1);
1733
1734 /* guess_execution_size(insn, src0); */
1735
1736
1737 /* Make it so that future instructions will use the computed flag
1738 * value until brw_set_predicate_control_flag_value() is called
1739 * again.
1740 */
1741 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1742 dest.nr == 0) {
1743 p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
1744 p->flag_value = 0xff;
1745 }
1746 }
1747
1748 /* Issue 'wait' instruction for n1, host could program MMIO
1749 to wake up thread. */
brw_WAIT(struct brw_compile * p)1750 void brw_WAIT (struct brw_compile *p)
1751 {
1752 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1753 struct brw_reg src = brw_notification_1_reg();
1754
1755 brw_set_dest(p, insn, src);
1756 brw_set_src0(p, insn, src);
1757 brw_set_src1(p, insn, brw_null_reg());
1758 insn->header.execution_size = 0; /* must */
1759 insn->header.predicate_control = 0;
1760 insn->header.compression_control = 0;
1761 }
1762
1763
1764 /***********************************************************************
1765 * Helpers for the various SEND message types:
1766 */
1767
1768 /** Extended math function, float[8].
1769 */
brw_math(struct brw_compile * p,struct brw_reg dest,unsigned function,unsigned msg_reg_nr,struct brw_reg src,unsigned data_type,unsigned precision)1770 void brw_math( struct brw_compile *p,
1771 struct brw_reg dest,
1772 unsigned function,
1773 unsigned msg_reg_nr,
1774 struct brw_reg src,
1775 unsigned data_type,
1776 unsigned precision )
1777 {
1778 struct intel_context *intel = &p->brw->intel;
1779
1780 if (intel->gen >= 6) {
1781 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1782
1783 assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1784 assert(src.file == BRW_GENERAL_REGISTER_FILE);
1785
1786 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1787 if (intel->gen == 6)
1788 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
1789
1790 /* Source modifiers are ignored for extended math instructions on Gen6. */
1791 if (intel->gen == 6) {
1792 assert(!src.negate);
1793 assert(!src.abs);
1794 }
1795
1796 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1797 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1798 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1799 assert(src.type != BRW_REGISTER_TYPE_F);
1800 } else {
1801 assert(src.type == BRW_REGISTER_TYPE_F);
1802 }
1803
1804 /* Math is the same ISA format as other opcodes, except that CondModifier
1805 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1806 */
1807 insn->header.destreg__conditionalmod = function;
1808
1809 brw_set_dest(p, insn, dest);
1810 brw_set_src0(p, insn, src);
1811 brw_set_src1(p, insn, brw_null_reg());
1812 } else {
1813 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1814
1815 /* Example code doesn't set predicate_control for send
1816 * instructions.
1817 */
1818 insn->header.predicate_control = 0;
1819 insn->header.destreg__conditionalmod = msg_reg_nr;
1820
1821 brw_set_dest(p, insn, dest);
1822 brw_set_src0(p, insn, src);
1823 brw_set_math_message(p,
1824 insn,
1825 function,
1826 src.type == BRW_REGISTER_TYPE_D,
1827 precision,
1828 data_type);
1829 }
1830 }
1831
1832 /** Extended math function, float[8].
1833 */
brw_math2(struct brw_compile * p,struct brw_reg dest,unsigned function,struct brw_reg src0,struct brw_reg src1)1834 void brw_math2(struct brw_compile *p,
1835 struct brw_reg dest,
1836 unsigned function,
1837 struct brw_reg src0,
1838 struct brw_reg src1)
1839 {
1840 struct intel_context *intel = &p->brw->intel;
1841 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1842
1843 assert(intel->gen >= 6);
1844 (void) intel;
1845
1846
1847 assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1848 assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1849 assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1850
1851 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1852 if (intel->gen == 6) {
1853 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1854 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1855 }
1856
1857 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1858 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1859 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1860 assert(src0.type != BRW_REGISTER_TYPE_F);
1861 assert(src1.type != BRW_REGISTER_TYPE_F);
1862 } else {
1863 assert(src0.type == BRW_REGISTER_TYPE_F);
1864 assert(src1.type == BRW_REGISTER_TYPE_F);
1865 }
1866
1867 /* Source modifiers are ignored for extended math instructions on Gen6. */
1868 if (intel->gen == 6) {
1869 assert(!src0.negate);
1870 assert(!src0.abs);
1871 assert(!src1.negate);
1872 assert(!src1.abs);
1873 }
1874
1875 /* Math is the same ISA format as other opcodes, except that CondModifier
1876 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1877 */
1878 insn->header.destreg__conditionalmod = function;
1879
1880 brw_set_dest(p, insn, dest);
1881 brw_set_src0(p, insn, src0);
1882 brw_set_src1(p, insn, src1);
1883 }
1884
1885
1886 /**
1887 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1888 * using a constant offset per channel.
1889 *
1890 * The offset must be aligned to oword size (16 bytes). Used for
1891 * register spilling.
1892 */
brw_oword_block_write_scratch(struct brw_compile * p,struct brw_reg mrf,int num_regs,unsigned offset)1893 void brw_oword_block_write_scratch(struct brw_compile *p,
1894 struct brw_reg mrf,
1895 int num_regs,
1896 unsigned offset)
1897 {
1898 struct intel_context *intel = &p->brw->intel;
1899 uint32_t msg_control, msg_type;
1900 int mlen;
1901
1902 if (intel->gen >= 6)
1903 offset /= 16;
1904
1905 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1906
1907 if (num_regs == 1) {
1908 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1909 mlen = 2;
1910 } else {
1911 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1912 mlen = 3;
1913 }
1914
1915 /* Set up the message header. This is g0, with g0.2 filled with
1916 * the offset. We don't want to leave our offset around in g0 or
1917 * it'll screw up texture samples, so set it up inside the message
1918 * reg.
1919 */
1920 {
1921 brw_push_insn_state(p);
1922 brw_set_mask_control(p, BRW_MASK_DISABLE);
1923 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1924
1925 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1926
1927 /* set message header global offset field (reg 0, element 2) */
1928 brw_MOV(p,
1929 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1930 mrf.nr,
1931 2), BRW_REGISTER_TYPE_UD),
1932 brw_imm_ud(offset));
1933
1934 brw_pop_insn_state(p);
1935 }
1936
1937 {
1938 struct brw_reg dest;
1939 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1940 int send_commit_msg;
1941 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1942 BRW_REGISTER_TYPE_UW);
1943
1944 if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1945 insn->header.compression_control = BRW_COMPRESSION_NONE;
1946 src_header = vec16(src_header);
1947 }
1948 assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1949 insn->header.destreg__conditionalmod = mrf.nr;
1950
1951 /* Until gen6, writes followed by reads from the same location
1952 * are not guaranteed to be ordered unless write_commit is set.
1953 * If set, then a no-op write is issued to the destination
1954 * register to set a dependency, and a read from the destination
1955 * can be used to ensure the ordering.
1956 *
1957 * For gen6, only writes between different threads need ordering
1958 * protection. Our use of DP writes is all about register
1959 * spilling within a thread.
1960 */
1961 if (intel->gen >= 6) {
1962 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1963 send_commit_msg = 0;
1964 } else {
1965 dest = src_header;
1966 send_commit_msg = 1;
1967 }
1968
1969 brw_set_dest(p, insn, dest);
1970 if (intel->gen >= 6) {
1971 brw_set_src0(p, insn, mrf);
1972 } else {
1973 brw_set_src0(p, insn, brw_null_reg());
1974 }
1975
1976 if (intel->gen >= 6)
1977 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1978 else
1979 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1980
1981 brw_set_dp_write_message(p,
1982 insn,
1983 255, /* binding table index (255=stateless) */
1984 msg_control,
1985 msg_type,
1986 mlen,
1987 true, /* header_present */
1988 0, /* not a render target */
1989 send_commit_msg, /* response_length */
1990 0, /* eot */
1991 send_commit_msg);
1992 }
1993 }
1994
1995
1996 /**
1997 * Read a block of owords (half a GRF each) from the scratch buffer
1998 * using a constant index per channel.
1999 *
2000 * Offset must be aligned to oword size (16 bytes). Used for register
2001 * spilling.
2002 */
2003 void
brw_oword_block_read_scratch(struct brw_compile * p,struct brw_reg dest,struct brw_reg mrf,int num_regs,unsigned offset)2004 brw_oword_block_read_scratch(struct brw_compile *p,
2005 struct brw_reg dest,
2006 struct brw_reg mrf,
2007 int num_regs,
2008 unsigned offset)
2009 {
2010 struct intel_context *intel = &p->brw->intel;
2011 uint32_t msg_control;
2012 int rlen;
2013
2014 if (intel->gen >= 6)
2015 offset /= 16;
2016
2017 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2018 dest = retype(dest, BRW_REGISTER_TYPE_UW);
2019
2020 if (num_regs == 1) {
2021 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
2022 rlen = 1;
2023 } else {
2024 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
2025 rlen = 2;
2026 }
2027
2028 {
2029 brw_push_insn_state(p);
2030 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2031 brw_set_mask_control(p, BRW_MASK_DISABLE);
2032
2033 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2034
2035 /* set message header global offset field (reg 0, element 2) */
2036 brw_MOV(p,
2037 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2038 mrf.nr,
2039 2), BRW_REGISTER_TYPE_UD),
2040 brw_imm_ud(offset));
2041
2042 brw_pop_insn_state(p);
2043 }
2044
2045 {
2046 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2047
2048 assert(insn->header.predicate_control == 0);
2049 insn->header.compression_control = BRW_COMPRESSION_NONE;
2050 insn->header.destreg__conditionalmod = mrf.nr;
2051
2052 brw_set_dest(p, insn, dest); /* UW? */
2053 if (intel->gen >= 6) {
2054 brw_set_src0(p, insn, mrf);
2055 } else {
2056 brw_set_src0(p, insn, brw_null_reg());
2057 }
2058
2059 brw_set_dp_read_message(p,
2060 insn,
2061 255, /* binding table index (255=stateless) */
2062 msg_control,
2063 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2064 BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2065 1, /* msg_length */
2066 true, /* header_present */
2067 rlen);
2068 }
2069 }
2070
2071 /**
2072 * Read a float[4] vector from the data port Data Cache (const buffer).
2073 * Location (in buffer) should be a multiple of 16.
2074 * Used for fetching shader constants.
2075 */
brw_oword_block_read(struct brw_compile * p,struct brw_reg dest,struct brw_reg mrf,uint32_t offset,uint32_t bind_table_index)2076 void brw_oword_block_read(struct brw_compile *p,
2077 struct brw_reg dest,
2078 struct brw_reg mrf,
2079 uint32_t offset,
2080 uint32_t bind_table_index)
2081 {
2082 struct intel_context *intel = &p->brw->intel;
2083
2084 /* On newer hardware, offset is in units of owords. */
2085 if (intel->gen >= 6)
2086 offset /= 16;
2087
2088 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2089
2090 brw_push_insn_state(p);
2091 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2092 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2093 brw_set_mask_control(p, BRW_MASK_DISABLE);
2094
2095 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2096
2097 /* set message header global offset field (reg 0, element 2) */
2098 brw_MOV(p,
2099 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2100 mrf.nr,
2101 2), BRW_REGISTER_TYPE_UD),
2102 brw_imm_ud(offset));
2103
2104 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2105 insn->header.destreg__conditionalmod = mrf.nr;
2106
2107 /* cast dest to a uword[8] vector */
2108 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2109
2110 brw_set_dest(p, insn, dest);
2111 if (intel->gen >= 6) {
2112 brw_set_src0(p, insn, mrf);
2113 } else {
2114 brw_set_src0(p, insn, brw_null_reg());
2115 }
2116
2117 brw_set_dp_read_message(p,
2118 insn,
2119 bind_table_index,
2120 BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2121 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2122 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2123 1, /* msg_length */
2124 true, /* header_present */
2125 1); /* response_length (1 reg, 2 owords!) */
2126
2127 brw_pop_insn_state(p);
2128 }
2129
2130
brw_fb_WRITE(struct brw_compile * p,int dispatch_width,unsigned msg_reg_nr,struct brw_reg src0,unsigned msg_control,unsigned binding_table_index,unsigned msg_length,unsigned response_length,bool eot,bool header_present)2131 void brw_fb_WRITE(struct brw_compile *p,
2132 int dispatch_width,
2133 unsigned msg_reg_nr,
2134 struct brw_reg src0,
2135 unsigned msg_control,
2136 unsigned binding_table_index,
2137 unsigned msg_length,
2138 unsigned response_length,
2139 bool eot,
2140 bool header_present)
2141 {
2142 struct intel_context *intel = &p->brw->intel;
2143 struct brw_instruction *insn;
2144 unsigned msg_type;
2145 struct brw_reg dest;
2146
2147 if (dispatch_width == 16)
2148 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2149 else
2150 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2151
2152 if (intel->gen >= 6) {
2153 insn = next_insn(p, BRW_OPCODE_SENDC);
2154 } else {
2155 insn = next_insn(p, BRW_OPCODE_SEND);
2156 }
2157 /* The execution mask is ignored for render target writes. */
2158 insn->header.predicate_control = 0;
2159 insn->header.compression_control = BRW_COMPRESSION_NONE;
2160
2161 if (intel->gen >= 6) {
2162 /* headerless version, just submit color payload */
2163 src0 = brw_message_reg(msg_reg_nr);
2164
2165 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2166 } else {
2167 insn->header.destreg__conditionalmod = msg_reg_nr;
2168
2169 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2170 }
2171
2172 brw_set_dest(p, insn, dest);
2173 brw_set_src0(p, insn, src0);
2174 brw_set_dp_write_message(p,
2175 insn,
2176 binding_table_index,
2177 msg_control,
2178 msg_type,
2179 msg_length,
2180 header_present,
2181 eot, /* last render target write */
2182 response_length,
2183 eot,
2184 0 /* send_commit_msg */);
2185 }
2186
2187
2188 /**
2189 * Texture sample instruction.
2190 * Note: the msg_type plus msg_length values determine exactly what kind
2191 * of sampling operation is performed. See volume 4, page 161 of docs.
2192 */
brw_SAMPLE(struct brw_compile * p,struct brw_reg dest,unsigned msg_reg_nr,struct brw_reg src0,unsigned binding_table_index,unsigned sampler,unsigned writemask,unsigned msg_type,unsigned response_length,unsigned msg_length,unsigned header_present,unsigned simd_mode,unsigned return_format)2193 void brw_SAMPLE(struct brw_compile *p,
2194 struct brw_reg dest,
2195 unsigned msg_reg_nr,
2196 struct brw_reg src0,
2197 unsigned binding_table_index,
2198 unsigned sampler,
2199 unsigned writemask,
2200 unsigned msg_type,
2201 unsigned response_length,
2202 unsigned msg_length,
2203 unsigned header_present,
2204 unsigned simd_mode,
2205 unsigned return_format)
2206 {
2207 struct intel_context *intel = &p->brw->intel;
2208 bool need_stall = 0;
2209
2210 if (writemask == 0) {
2211 /*printf("%s: zero writemask??\n", __FUNCTION__); */
2212 return;
2213 }
2214
2215 /* Hardware doesn't do destination dependency checking on send
2216 * instructions properly. Add a workaround which generates the
2217 * dependency by other means. In practice it seems like this bug
2218 * only crops up for texture samples, and only where registers are
2219 * written by the send and then written again later without being
2220 * read in between. Luckily for us, we already track that
2221 * information and use it to modify the writemask for the
2222 * instruction, so that is a guide for whether a workaround is
2223 * needed.
2224 */
2225 if (writemask != BRW_WRITEMASK_XYZW) {
2226 unsigned dst_offset = 0;
2227 unsigned i, newmask = 0, len = 0;
2228
2229 for (i = 0; i < 4; i++) {
2230 if (writemask & (1<<i))
2231 break;
2232 dst_offset += 2;
2233 }
2234 for (; i < 4; i++) {
2235 if (!(writemask & (1<<i)))
2236 break;
2237 newmask |= 1<<i;
2238 len++;
2239 }
2240
2241 if (newmask != writemask) {
2242 need_stall = 1;
2243 /* printf("need stall %x %x\n", newmask , writemask); */
2244 }
2245 else {
2246 bool dispatch_16 = false;
2247
2248 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
2249
2250 guess_execution_size(p, p->current, dest);
2251 if (p->current->header.execution_size == BRW_EXECUTE_16)
2252 dispatch_16 = true;
2253
2254 newmask = ~newmask & BRW_WRITEMASK_XYZW;
2255
2256 brw_push_insn_state(p);
2257
2258 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2259 brw_set_mask_control(p, BRW_MASK_DISABLE);
2260
2261 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
2262 retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
2263 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
2264
2265 brw_pop_insn_state(p);
2266
2267 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
2268 dest = offset(dest, dst_offset);
2269
2270 /* For 16-wide dispatch, masked channels are skipped in the
2271 * response. For 8-wide, masked channels still take up slots,
2272 * and are just not written to.
2273 */
2274 if (dispatch_16)
2275 response_length = len * 2;
2276 }
2277 }
2278
2279 {
2280 struct brw_instruction *insn;
2281
2282 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2283
2284 insn = next_insn(p, BRW_OPCODE_SEND);
2285 insn->header.predicate_control = 0; /* XXX */
2286 insn->header.compression_control = BRW_COMPRESSION_NONE;
2287 if (intel->gen < 6)
2288 insn->header.destreg__conditionalmod = msg_reg_nr;
2289
2290 brw_set_dest(p, insn, dest);
2291 brw_set_src0(p, insn, src0);
2292 brw_set_sampler_message(p, insn,
2293 binding_table_index,
2294 sampler,
2295 msg_type,
2296 response_length,
2297 msg_length,
2298 header_present,
2299 simd_mode,
2300 return_format);
2301 }
2302
2303 if (need_stall) {
2304 struct brw_reg reg = vec8(offset(dest, response_length-1));
2305
2306 /* mov (8) r9.0<1>:f r9.0<8;8,1>:f { Align1 }
2307 */
2308 brw_push_insn_state(p);
2309 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2310 brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
2311 retype(reg, BRW_REGISTER_TYPE_UD));
2312 brw_pop_insn_state(p);
2313 }
2314
2315 }
2316
2317 /* All these variables are pretty confusing - we might be better off
2318 * using bitmasks and macros for this, in the old style. Or perhaps
2319 * just having the caller instantiate the fields in dword3 itself.
2320 */
brw_urb_WRITE(struct brw_compile * p,struct brw_reg dest,unsigned msg_reg_nr,struct brw_reg src0,bool allocate,bool used,unsigned msg_length,unsigned response_length,bool eot,bool writes_complete,unsigned offset,unsigned swizzle)2321 void brw_urb_WRITE(struct brw_compile *p,
2322 struct brw_reg dest,
2323 unsigned msg_reg_nr,
2324 struct brw_reg src0,
2325 bool allocate,
2326 bool used,
2327 unsigned msg_length,
2328 unsigned response_length,
2329 bool eot,
2330 bool writes_complete,
2331 unsigned offset,
2332 unsigned swizzle)
2333 {
2334 struct intel_context *intel = &p->brw->intel;
2335 struct brw_instruction *insn;
2336
2337 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2338
2339 if (intel->gen == 7) {
2340 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2341 brw_push_insn_state(p);
2342 brw_set_access_mode(p, BRW_ALIGN_1);
2343 brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2344 BRW_REGISTER_TYPE_UD),
2345 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2346 brw_imm_ud(0xff00));
2347 brw_pop_insn_state(p);
2348 }
2349
2350 insn = next_insn(p, BRW_OPCODE_SEND);
2351
2352 assert(msg_length < BRW_MAX_MRF);
2353
2354 brw_set_dest(p, insn, dest);
2355 brw_set_src0(p, insn, src0);
2356 brw_set_src1(p, insn, brw_imm_d(0));
2357
2358 if (intel->gen < 6)
2359 insn->header.destreg__conditionalmod = msg_reg_nr;
2360
2361 brw_set_urb_message(p,
2362 insn,
2363 allocate,
2364 used,
2365 msg_length,
2366 response_length,
2367 eot,
2368 writes_complete,
2369 offset,
2370 swizzle);
2371 }
2372
2373 static int
next_ip(struct brw_compile * p,int ip)2374 next_ip(struct brw_compile *p, int ip)
2375 {
2376 struct brw_instruction *insn = (void *)p->store + ip;
2377
2378 if (insn->header.cmpt_control)
2379 return ip + 8;
2380 else
2381 return ip + 16;
2382 }
2383
2384 static int
brw_find_next_block_end(struct brw_compile * p,int start)2385 brw_find_next_block_end(struct brw_compile *p, int start)
2386 {
2387 int ip;
2388 void *store = p->store;
2389
2390 for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2391 struct brw_instruction *insn = store + ip;
2392
2393 switch (insn->header.opcode) {
2394 case BRW_OPCODE_ENDIF:
2395 case BRW_OPCODE_ELSE:
2396 case BRW_OPCODE_WHILE:
2397 case BRW_OPCODE_HALT:
2398 return ip;
2399 }
2400 }
2401
2402 return 0;
2403 }
2404
2405 /* There is no DO instruction on gen6, so to find the end of the loop
2406 * we have to see if the loop is jumping back before our start
2407 * instruction.
2408 */
2409 static int
brw_find_loop_end(struct brw_compile * p,int start)2410 brw_find_loop_end(struct brw_compile *p, int start)
2411 {
2412 struct intel_context *intel = &p->brw->intel;
2413 int ip;
2414 int scale = 8;
2415 void *store = p->store;
2416
2417 /* Always start after the instruction (such as a WHILE) we're trying to fix
2418 * up.
2419 */
2420 for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2421 struct brw_instruction *insn = store + ip;
2422
2423 if (insn->header.opcode == BRW_OPCODE_WHILE) {
2424 int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
2425 : insn->bits3.break_cont.jip;
2426 if (ip + jip * scale <= start)
2427 return ip;
2428 }
2429 }
2430 assert(!"not reached");
2431 return start;
2432 }
2433
2434 /* After program generation, go back and update the UIP and JIP of
2435 * BREAK, CONT, and HALT instructions to their correct locations.
2436 */
2437 void
brw_set_uip_jip(struct brw_compile * p)2438 brw_set_uip_jip(struct brw_compile *p)
2439 {
2440 struct intel_context *intel = &p->brw->intel;
2441 int ip;
2442 int scale = 8;
2443 void *store = p->store;
2444
2445 if (intel->gen < 6)
2446 return;
2447
2448 for (ip = 0; ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2449 struct brw_instruction *insn = store + ip;
2450
2451 if (insn->header.cmpt_control) {
2452 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
2453 assert(insn->header.opcode != BRW_OPCODE_BREAK &&
2454 insn->header.opcode != BRW_OPCODE_CONTINUE &&
2455 insn->header.opcode != BRW_OPCODE_HALT);
2456 continue;
2457 }
2458
2459 int block_end_ip = brw_find_next_block_end(p, ip);
2460 switch (insn->header.opcode) {
2461 case BRW_OPCODE_BREAK:
2462 assert(block_end_ip != 0);
2463 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2464 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2465 insn->bits3.break_cont.uip =
2466 (brw_find_loop_end(p, ip) - ip +
2467 (intel->gen == 6 ? 16 : 0)) / scale;
2468 break;
2469 case BRW_OPCODE_CONTINUE:
2470 assert(block_end_ip != 0);
2471 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2472 insn->bits3.break_cont.uip =
2473 (brw_find_loop_end(p, ip) - ip) / scale;
2474
2475 assert(insn->bits3.break_cont.uip != 0);
2476 assert(insn->bits3.break_cont.jip != 0);
2477 break;
2478
2479 case BRW_OPCODE_ENDIF:
2480 if (block_end_ip == 0)
2481 insn->bits3.break_cont.jip = 2;
2482 else
2483 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2484 break;
2485
2486 case BRW_OPCODE_HALT:
2487 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2488 *
2489 * "In case of the halt instruction not inside any conditional
2490 * code block, the value of <JIP> and <UIP> should be the
2491 * same. In case of the halt instruction inside conditional code
2492 * block, the <UIP> should be the end of the program, and the
2493 * <JIP> should be end of the most inner conditional code block."
2494 *
2495 * The uip will have already been set by whoever set up the
2496 * instruction.
2497 */
2498 if (block_end_ip == 0) {
2499 insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
2500 } else {
2501 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2502 }
2503 assert(insn->bits3.break_cont.uip != 0);
2504 assert(insn->bits3.break_cont.jip != 0);
2505 break;
2506 }
2507 }
2508 }
2509
brw_ff_sync(struct brw_compile * p,struct brw_reg dest,unsigned msg_reg_nr,struct brw_reg src0,bool allocate,unsigned response_length,bool eot)2510 void brw_ff_sync(struct brw_compile *p,
2511 struct brw_reg dest,
2512 unsigned msg_reg_nr,
2513 struct brw_reg src0,
2514 bool allocate,
2515 unsigned response_length,
2516 bool eot)
2517 {
2518 struct intel_context *intel = &p->brw->intel;
2519 struct brw_instruction *insn;
2520
2521 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2522
2523 insn = next_insn(p, BRW_OPCODE_SEND);
2524 brw_set_dest(p, insn, dest);
2525 brw_set_src0(p, insn, src0);
2526 brw_set_src1(p, insn, brw_imm_d(0));
2527
2528 if (intel->gen < 6)
2529 insn->header.destreg__conditionalmod = msg_reg_nr;
2530
2531 brw_set_ff_sync_message(p,
2532 insn,
2533 allocate,
2534 response_length,
2535 eot);
2536 }
2537
2538 /**
2539 * Emit the SEND instruction necessary to generate stream output data on Gen6
2540 * (for transform feedback).
2541 *
2542 * If send_commit_msg is true, this is the last piece of stream output data
2543 * from this thread, so send the data as a committed write. According to the
2544 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2545 *
2546 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2547 * writes are complete by sending the final write as a committed write."
2548 */
2549 void
brw_svb_write(struct brw_compile * p,struct brw_reg dest,unsigned msg_reg_nr,struct brw_reg src0,unsigned binding_table_index,bool send_commit_msg)2550 brw_svb_write(struct brw_compile *p,
2551 struct brw_reg dest,
2552 unsigned msg_reg_nr,
2553 struct brw_reg src0,
2554 unsigned binding_table_index,
2555 bool send_commit_msg)
2556 {
2557 struct brw_instruction *insn;
2558
2559 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2560
2561 insn = next_insn(p, BRW_OPCODE_SEND);
2562 brw_set_dest(p, insn, dest);
2563 brw_set_src0(p, insn, src0);
2564 brw_set_src1(p, insn, brw_imm_d(0));
2565 brw_set_dp_write_message(p, insn,
2566 binding_table_index,
2567 0, /* msg_control: ignored */
2568 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2569 1, /* msg_length */
2570 true, /* header_present */
2571 0, /* last_render_target: ignored */
2572 send_commit_msg, /* response_length */
2573 0, /* end_of_thread */
2574 send_commit_msg); /* send_commit_msg */
2575 }
2576
2577 /**
2578 * This instruction is generated as a single-channel align1 instruction by
2579 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
2580 *
2581 * We can't use the typed atomic op in the FS because that has the execution
2582 * mask ANDed with the pixel mask, but we just want to write the one dword for
2583 * all the pixels.
2584 *
2585 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
2586 * one u32. So we use the same untyped atomic write message as the pixel
2587 * shader.
2588 *
2589 * The untyped atomic operation requires a BUFFER surface type with RAW
2590 * format, and is only accessible through the legacy DATA_CACHE dataport
2591 * messages.
2592 */
brw_shader_time_add(struct brw_compile * p,int base_mrf,uint32_t surf_index)2593 void brw_shader_time_add(struct brw_compile *p,
2594 int base_mrf,
2595 uint32_t surf_index)
2596 {
2597 struct intel_context *intel = &p->brw->intel;
2598 assert(intel->gen >= 7);
2599
2600 brw_push_insn_state(p);
2601 brw_set_access_mode(p, BRW_ALIGN_1);
2602 brw_set_mask_control(p, BRW_MASK_DISABLE);
2603 struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
2604 brw_pop_insn_state(p);
2605
2606 /* We use brw_vec1_reg and unmasked because we want to increment the given
2607 * offset only once.
2608 */
2609 brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
2610 BRW_ARF_NULL, 0));
2611 brw_set_src0(p, send, brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2612 base_mrf, 0));
2613
2614 bool header_present = false;
2615 bool eot = false;
2616 uint32_t mlen = 2; /* offset, value */
2617 uint32_t rlen = 0;
2618 brw_set_message_descriptor(p, send,
2619 GEN7_SFID_DATAPORT_DATA_CACHE,
2620 mlen, rlen, header_present, eot);
2621
2622 send->bits3.ud |= 6 << 14; /* untyped atomic op */
2623 send->bits3.ud |= 0 << 13; /* no return data */
2624 send->bits3.ud |= 1 << 12; /* SIMD8 mode */
2625 send->bits3.ud |= BRW_AOP_ADD << 8;
2626 send->bits3.ud |= surf_index << 0;
2627 }
2628