/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_vec4.h"
#include "brw_cfg.h"
#include "brw_eu.h"
#include "util/u_math.h"

namespace brw {

vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
                                   const src_reg &src0, const src_reg &src1,
                                   const src_reg &src2)
{
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;
   this->saturate = false;
   this->force_writemask_all = false;
   this->no_dd_clear = false;
   this->no_dd_check = false;
   this->writes_accumulator = false;
   this->conditional_mod = BRW_CONDITIONAL_NONE;
   this->predicate = BRW_PREDICATE_NONE;
   this->predicate_inverse = false;
   this->target = 0;
   this->shadow_compare = false;
   this->eot = false;
   this->ir = NULL;
   this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
   this->header_size = 0;
   this->flag_subreg = 0;
   this->mlen = 0;
   this->base_mrf = 0;
   this->offset = 0;
   this->exec_size = 8;
   this->group = 0;
   this->size_written = (dst.file == BAD_FILE ?
                         0 : this->exec_size * type_sz(dst.type));
   this->annotation = NULL;
}

vec4_instruction *
vec4_visitor::emit(vec4_instruction *inst)
{
   inst->ir = this->base_ir;
   inst->annotation = this->current_annotation;

   this->instructions.push_tail(inst);

   return inst;
}

vec4_instruction *
vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
                          vec4_instruction *new_inst)
{
   new_inst->ir = inst->ir;
   new_inst->annotation = inst->annotation;

   inst->insert_before(block, new_inst);

   return inst;
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
                   const src_reg &src1, const src_reg &src2)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
}


vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
                   const src_reg &src1)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
}

#define ALU1(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
   }

#define ALU2(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1)                                \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
                                           src0, src1);                 \
   }

#define ALU2_ACC(op)                                                    \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1)                                \
   {                                                                    \
      vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
                                  BRW_OPCODE_##op, dst, src0, src1);    \
      inst->writes_accumulator = true;                                  \
      return inst;                                                      \
   }

#define ALU3(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1, const src_reg &src2)           \
   {                                                                    \
      assert(devinfo->ver >= 6);                                        \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
                                           src0, src1, src2);           \
   }
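
/* The ALU1/ALU2/ALU2_ACC/ALU3 expansions below generate the usual builder
 * helpers (e.g. vec4_visitor::ADD(dst, src0, src1)).  Note that these only
 * construct the instruction; callers pass the result to emit().
 */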

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU1(F32TO16)
ALU1(F16TO32)
ALU2(ADD)
ALU2(MUL)
ALU2_ACC(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(DP3)
ALU2(DP4)
ALU2(DPH)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2_ACC(ADDC)
ALU2_ACC(SUBB)
ALU2(MAC)
ALU1(DIM)

/** Gfx4 predicated IF. */
vec4_instruction *
vec4_visitor::IF(enum brw_predicate predicate)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
   inst->predicate = predicate;

   return inst;
}

/** Gfx6 IF with embedded comparison. */
vec4_instruction *
vec4_visitor::IF(src_reg src0, src_reg src1,
                 enum brw_conditional_mod condition)
{
   assert(devinfo->ver == 6);

   vec4_instruction *inst;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
                                        src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
vec4_instruction *
vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
                  enum brw_conditional_mod condition)
{
   vec4_instruction *inst;

   /* Take the instruction:
    *
    *    CMP null<d> src0<f> src1<f>
    *
    * Original gfx4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    *
    * The destination type doesn't matter on newer generations, so we set the
    * type to match src0 so we can compact the instruction.
    */
   dst.type = src0.type;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
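
/* Illustrative use (not taken from this file): a comparison whose flag
 * result predicates a later instruction could be emitted as
 *
 *    emit(CMP(dst_null_d(), op0, op1, BRW_CONDITIONAL_NZ));
 *    inst = emit(MOV(dst, src));
 *    inst->predicate = BRW_PREDICATE_NORMAL;
 */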

vec4_instruction *
vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GFX4_SCRATCH_READ,
                                        dst, index);
   inst->base_mrf = FIRST_SPILL_MRF(devinfo->ver) + 1;
   inst->mlen = 2;

   return inst;
}

vec4_instruction *
vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
                            const src_reg &index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GFX4_SCRATCH_WRITE,
                                        dst, src, index);
   inst->base_mrf = FIRST_SPILL_MRF(devinfo->ver);
   inst->mlen = 3;

   return inst;
}

src_reg
vec4_visitor::fix_3src_operand(const src_reg &src)
{
   /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
    * able to use vertical stride of zero to replicate the vec4 uniform, like
    *
    *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
    *
    * But you can't, since vertical stride is always four in three-source
    * instructions. Instead, insert a MOV instruction to do the replication so
    * that the three-source instruction can consume it.
    */

   /* The MOV is only needed if the source is a uniform or immediate. */
   if (src.file != UNIFORM && src.file != IMM)
      return src;

   if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
      return src;

   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
   expanded.type = src.type;
   emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
   return src_reg(expanded);
}
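
/* Sketch (not a literal disassembly): a three-source MAD with a uniform
 * operand effectively becomes a VEC4_OPCODE_UNPACK_UNIFORM into a temporary
 * VGRF followed by the MAD reading that temporary, e.g.
 *
 *    unpack_uniform tmp, u1.xyzw
 *    mad            dst, tmp, src1, src2
 */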

src_reg
vec4_visitor::fix_math_operand(const src_reg &src)
{
   if (devinfo->ver < 6 || src.file == BAD_FILE)
      return src;

   /* The gfx6 math instruction ignores the source modifiers --
    * swizzle, abs, negate, and at least some parts of the register
    * region description.
    *
    * Rather than trying to enumerate all these cases, *always* expand the
    * operand to a temp GRF for gfx6.
    *
    * For gfx7, keep the operand as-is, except if immediate, which gfx7 still
    * can't use.
    */

   if (devinfo->ver == 7 && src.file != IMM)
      return src;

   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
   expanded.type = src.type;
   emit(MOV(expanded, src));
   return src_reg(expanded);
}
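
/* In other words: on gfx6 every math operand is first copied to a temporary
 * GRF; on gfx7 only immediates are; on gfx4/5 (and for BAD_FILE sources) the
 * operand is left untouched.
 */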

vec4_instruction *
vec4_visitor::emit_math(enum opcode opcode,
                        const dst_reg &dst,
                        const src_reg &src0, const src_reg &src1)
{
   vec4_instruction *math =
      emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));

   if (devinfo->ver == 6 && dst.writemask != WRITEMASK_XYZW) {
      /* MATH on Gfx6 must be align1, so we can't do writemasks. */
      math->dst = dst_reg(this, glsl_type::vec4_type);
      math->dst.type = dst.type;
      math = emit(MOV(dst, src_reg(math->dst)));
   } else if (devinfo->ver < 6) {
      math->base_mrf = 1;
      math->mlen = src1.file == BAD_FILE ? 1 : 2;
   }

   return math;
}

void
vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
{
   if (devinfo->ver < 7) {
      unreachable("ir_unop_pack_half_2x16 should be lowered");
   }

   assert(dst.type == BRW_REGISTER_TYPE_UD);
   assert(src0.type == BRW_REGISTER_TYPE_F);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the destination data type must be Word (W).
    *
    *   The destination must be DWord-aligned and specify a horizontal stride
    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
    *   each destination channel and the upper word is not modified.
    *
    * The above restriction implies that the f32to16 instruction must use
    * align1 mode, because only in align1 mode is it possible to specify
    * horizontal stride. We choose here to defy the hardware docs and emit
    * align16 instructions.
    *
    * (I [chadv] did attempt to emit align1 instructions for VS f32to16
    * instructions. I was partially successful in that the code passed all
    * tests. However, the code was dubiously correct and fragile, and the
    * tests were not harsh enough to probe that frailty. Not trusting the
    * code, I chose instead to remain in align16 mode in defiance of the hw
    * docs).
    *
    * I've [chadv] experimentally confirmed that, on gfx7 hardware and the
    * simulator, emitting a f32to16 in align16 mode with UD as destination
    * data type is safe. The behavior differs from that specified in the PRM
    * in that the upper word of each destination channel is cleared to 0.
    */

   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

#if 0
   /* Verify the undocumented behavior on which the following instructions
    * rely. If f32to16 fails to clear the upper word of the X and Y channels,
    * then the result of the bit-or instruction below will be incorrect.
    *
    * You should inspect the disasm output in order to verify that the MOV is
    * not optimized away.
    */
   emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
#endif

   /* Give tmp the form below, where "." means untouched.
    *
    *    w z          y          x w z          y          x
    *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
    *
    * That the upper word of each write-channel be 0 is required for the
    * following bit-shift and bit-or instructions to work. Note that this
    * relies on the undocumented hardware behavior mentioned above.
    */
   tmp_dst.writemask = WRITEMASK_XY;
   emit(F32TO16(tmp_dst, src0));

   /* Give the write-channels of dst the form:
    *   0xhhhh0000
    */
   tmp_src.swizzle = BRW_SWIZZLE_YYYY;
   emit(SHL(dst, tmp_src, brw_imm_ud(16u)));

   /* Finally, give the write-channels of dst the form of packHalf2x16's
    * output:
    *   0xhhhhllll
    */
   tmp_src.swizzle = BRW_SWIZZLE_XXXX;
   emit(OR(dst, src_reg(dst), tmp_src));
}
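
/* The resulting sequence is roughly one F32TO16, one SHL and one OR:
 *
 *    f32to16 tmp.xy, src0
 *    shl     dst, tmp.yyyy, 16
 *    or      dst, dst, tmp.xxxx
 */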

void
vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
{
   if (devinfo->ver < 7) {
      unreachable("ir_unop_unpack_half_2x16 should be lowered");
   }

   assert(dst.type == BRW_REGISTER_TYPE_F);
   assert(src0.type == BRW_REGISTER_TYPE_UD);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the source data type must be Word (W). The destination type must be
    *   F (Float).
    *
    * To use W as the source data type, we must adjust horizontal strides,
    * which is only possible in align1 mode. All my [chadv] attempts at
    * emitting align1 instructions for unpackHalf2x16 failed to pass the
    * Piglit tests, so I gave up.
    *
    * I've verified that, on gfx7 hardware and the simulator, it is safe to
    * emit f16to32 in align16 mode with UD as source data type.
    */

   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

   tmp_dst.writemask = WRITEMASK_X;
   emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));

   tmp_dst.writemask = WRITEMASK_Y;
   emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));

   dst.writemask = WRITEMASK_XY;
   emit(F16TO32(dst, tmp_src));
}
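
/* The resulting sequence is roughly:
 *
 *    and     tmp.x, src0, 0x0000ffff
 *    shr     tmp.y, src0, 16
 *    f16to32 dst.xy, tmp
 */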

void
vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
{
   /* Instead of splitting the 32-bit integer, shifting, and ORing it back
    * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
    * is not suitable to generate the shift values, but we can use the packed
    * vector float and a type-converting MOV.
    */
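   /* In the restricted 8-bit vector-float (VF) encoding, 0x00, 0x60, 0x70 and
    * 0x78 decode to 0.0, 8.0, 16.0 and 24.0, which the type-converting MOV
    * below turns into the per-channel shift counts <0, 8, 16, 24>.
    */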
   dst_reg shift(this, glsl_type::uvec4_type);
   emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));

   dst_reg shifted(this, glsl_type::uvec4_type);
   src0.swizzle = BRW_SWIZZLE_XXXX;
   emit(SHR(shifted, src0, src_reg(shift)));

   shifted.type = BRW_REGISTER_TYPE_UB;
   dst_reg f(this, glsl_type::vec4_type);
   emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));

   emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
}

void
vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
{
   /* Instead of splitting the 32-bit integer, shifting, and ORing it back
    * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
    * is not suitable to generate the shift values, but we can use the packed
    * vector float and a type-converting MOV.
    */
   dst_reg shift(this, glsl_type::uvec4_type);
   emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));

   dst_reg shifted(this, glsl_type::uvec4_type);
   src0.swizzle = BRW_SWIZZLE_XXXX;
   emit(SHR(shifted, src0, src_reg(shift)));

   shifted.type = BRW_REGISTER_TYPE_B;
   dst_reg f(this, glsl_type::vec4_type);
   emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));

   dst_reg scaled(this, glsl_type::vec4_type);
   emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));

   dst_reg max(this, glsl_type::vec4_type);
   emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
   emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
}

void
vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
{
   dst_reg saturated(this, glsl_type::vec4_type);
   vec4_instruction *inst = emit(MOV(saturated, src0));
   inst->saturate = true;

   dst_reg scaled(this, glsl_type::vec4_type);
   emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));

   dst_reg rounded(this, glsl_type::vec4_type);
   emit(RNDE(rounded, src_reg(scaled)));

   dst_reg u(this, glsl_type::uvec4_type);
   emit(MOV(u, src_reg(rounded)));

   src_reg bytes(u);
   emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
}

void
vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
{
   dst_reg max(this, glsl_type::vec4_type);
   emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));

   dst_reg min(this, glsl_type::vec4_type);
   emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));

   dst_reg scaled(this, glsl_type::vec4_type);
   emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));

   dst_reg rounded(this, glsl_type::vec4_type);
   emit(RNDE(rounded, src_reg(scaled)));

   dst_reg i(this, glsl_type::ivec4_type);
   emit(MOV(i, src_reg(rounded)));

   src_reg bytes(i);
   emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
}

/*
 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
 * false) elements needed to pack a type.
 */
static int
type_size_xvec4(const struct glsl_type *type, bool as_vec4, bool bindless)
{
   unsigned int i;
   int size;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_FLOAT16:
   case GLSL_TYPE_BOOL:
   case GLSL_TYPE_DOUBLE:
   case GLSL_TYPE_UINT16:
   case GLSL_TYPE_INT16:
   case GLSL_TYPE_UINT8:
   case GLSL_TYPE_INT8:
   case GLSL_TYPE_UINT64:
   case GLSL_TYPE_INT64:
      if (type->is_matrix()) {
         const glsl_type *col_type = type->column_type();
         unsigned col_slots =
            (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
         return type->matrix_columns * col_slots;
      } else {
         /* Regardless of size of vector, it gets a vec4. This is bad
          * packing for things like floats, but otherwise arrays become a
          * mess. Hopefully a later pass over the code can pack scalars
          * down if appropriate.
          */
         return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
      }
   case GLSL_TYPE_ARRAY:
      assert(type->length > 0);
      return type_size_xvec4(type->fields.array, as_vec4, bindless) *
             type->length;
   case GLSL_TYPE_STRUCT:
   case GLSL_TYPE_INTERFACE:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size_xvec4(type->fields.structure[i].type, as_vec4,
                                 bindless);
      }
      return size;
   case GLSL_TYPE_SUBROUTINE:
      return 1;

   case GLSL_TYPE_SAMPLER:
   case GLSL_TYPE_TEXTURE:
      /* Samplers and textures take up no register space, since they're baked
       * in at link time.
       */
      return bindless ? 1 : 0;
   case GLSL_TYPE_ATOMIC_UINT:
      return 0;
   case GLSL_TYPE_IMAGE:
      return bindless ? 1 : DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_FUNCTION:
      unreachable("not reached");
   }

   return 0;
}
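
/* Examples (vec4 counting, i.e. as_vec4 == true): a float or vec3 takes one
 * slot, a dvec4 takes two, a mat3 takes three (one per column), and a
 * float[8] array takes eight.
 */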

/**
 * Returns the minimum number of vec4 elements needed to pack a type.
 *
 * For simple types, it will return 1 (a single vec4); for matrices, the
 * number of columns; for array and struct, the sum of the vec4_size of
 * each of its elements; and for sampler and atomic, zero.
 *
 * This method is useful to calculate how much register space is needed to
 * store a particular type.
 */
extern "C" int
type_size_vec4(const struct glsl_type *type, bool bindless)
{
   return type_size_xvec4(type, true, bindless);
}

/**
 * Returns the minimum number of dvec4 elements needed to pack a type.
 *
 * For simple types, it will return 1 (a single dvec4); for matrices, the
 * number of columns; for array and struct, the sum of the dvec4_size of
 * each of its elements; and for sampler and atomic, zero.
 *
 * This method is useful to calculate how much register space is needed to
 * store a particular type.
 *
 * Measuring double-precision vertex inputs as dvec4 is required because
 * ARB_vertex_attrib_64bit states that these use the same number of locations
 * as the single-precision version. That is, two consecutive dvec4s would be
 * located in location "x" and location "x+1", not "x+2".
 *
 * In order to map vec4/dvec4 vertex inputs in the proper ATTRs,
 * remap_vs_attrs() will take into account both the location and whether the
 * type fits in one or two vec4 slots.
 */
extern "C" int
type_size_dvec4(const struct glsl_type *type, bool bindless)
{
   return type_size_xvec4(type, false, bindless);
}

src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(type_size_vec4(type, false));

   if (type->is_array() || type->is_struct()) {
      this->swizzle = BRW_SWIZZLE_NOOP;
   } else {
      this->swizzle = brw_swizzle_for_size(type->vector_elements);
   }

   this->type = brw_type_for_base_type(type);
}

src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
{
   assert(size > 0);

   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(type_size_vec4(type, false) * size);

   this->swizzle = BRW_SWIZZLE_NOOP;

   this->type = brw_type_for_base_type(type);
}

dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(type_size_vec4(type, false));

   if (type->is_array() || type->is_struct()) {
      this->writemask = WRITEMASK_XYZW;
   } else {
      this->writemask = (1 << type->vector_elements) - 1;
   }

   this->type = brw_type_for_base_type(type);
}

vec4_instruction *
vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
                          src_reg src0, src_reg src1)
{
   vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
   inst->conditional_mod = conditionalmod;
   return inst;
}
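
/* Note the conditional mod picks the semantics: BRW_CONDITIONAL_GE gives
 * max(src0, src1) and BRW_CONDITIONAL_L gives min(src0, src1), which is how
 * the pack/unpack_snorm helpers above clamp their values to [-1, 1].
 */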

/**
 * Emits the instructions needed to perform a pull constant load. before_block
 * and before_inst can be NULL, in which case the instruction will be appended
 * to the end of the instruction list.
 */
void
vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
                                          src_reg surf_index,
                                          src_reg offset_reg,
                                          bblock_t *before_block,
                                          vec4_instruction *before_inst)
{
   assert((before_inst == NULL && before_block == NULL) ||
          (before_inst && before_block));

   vec4_instruction *pull;

   if (devinfo->ver >= 7) {
      dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);

      grf_offset.type = offset_reg.type;

      pull = MOV(grf_offset, offset_reg);

      if (before_inst)
         emit_before(before_block, before_inst, pull);
      else
         emit(pull);

      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GFX7,
                                           dst,
                                           surf_index,
                                           src_reg(grf_offset));
      pull->mlen = 1;
   } else {
      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
                                           dst,
                                           surf_index,
                                           offset_reg);
      pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1;
      pull->mlen = 1;
   }

   if (before_inst)
      emit_before(before_block, before_inst, pull);
   else
      emit(pull);
}

src_reg
vec4_visitor::emit_uniformize(const src_reg &src)
{
   const src_reg chan_index(this, glsl_type::uint_type);
   const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
                              src.type);

   emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
      ->force_writemask_all = true;
   emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
      ->force_writemask_all = true;

   return src_reg(dst);
}
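
/* The FIND_LIVE_CHANNEL + BROADCAST pair above yields a register holding the
 * value from the first live channel of src, i.e. a value that is uniform
 * across the group and therefore safe to use as, e.g., a surface index.
 */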

void
vec4_visitor::gs_emit_vertex(int /* stream_id */)
{
   unreachable("not reached");
}

void
vec4_visitor::gs_end_primitive()
{
   unreachable("not reached");
}

void
vec4_visitor::emit_ndc_computation()
{
   if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
      return;

   /* Get the position */
   src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);

   /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
   dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
   output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
   output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;

   current_annotation = "NDC";
   dst_reg ndc_w = ndc;
   ndc_w.writemask = WRITEMASK_W;
   src_reg pos_w = pos;
   pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
   emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);

   dst_reg ndc_xyz = ndc;
   ndc_xyz.writemask = WRITEMASK_XYZ;

   emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
}

void
vec4_visitor::emit_psiz_and_flags(dst_reg reg)
{
   if (devinfo->ver < 6 &&
       ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
        output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
        devinfo->has_negative_rhw_bug)) {
      dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
      dst_reg header1_w = header1;
      header1_w.writemask = WRITEMASK_W;

      emit(MOV(header1, brw_imm_ud(0u)));

      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
         src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);

         current_annotation = "Point size";
         emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
         emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
      }

      if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
         current_annotation = "Clipping flags";
         dst_reg flags0 = dst_reg(this, glsl_type::uint_type);

         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
      }

      if (output_reg[VARYING_SLOT_CLIP_DIST1][0].file != BAD_FILE) {
         dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
         emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (devinfo->has_negative_rhw_bug &&
          output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
         src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
         ndc_w.swizzle = BRW_SWIZZLE_WWWW;
         emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
         vec4_instruction *inst;
         inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
         inst->predicate = BRW_PREDICATE_NORMAL;
         output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
         inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
         inst->predicate = BRW_PREDICATE_NORMAL;
      }

      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
   } else if (devinfo->ver < 6) {
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
   } else {
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
      if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) {
         dst_reg reg_w = reg;
         reg_w.writemask = WRITEMASK_W;
         src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
         reg_as_src.type = reg_w.type;
         reg_as_src.swizzle = brw_swizzle_for_size(1);
         emit(MOV(reg_w, reg_as_src));
      }
      if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) {
         dst_reg reg_y = reg;
         reg_y.writemask = WRITEMASK_Y;
         reg_y.type = BRW_REGISTER_TYPE_D;
         output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
         emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
      }
      if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) {
         dst_reg reg_z = reg;
         reg_z.writemask = WRITEMASK_Z;
         reg_z.type = BRW_REGISTER_TYPE_D;
         output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
         emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
      }
   }
}

vec4_instruction *
vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
{
   assert(varying < VARYING_SLOT_MAX);

   unsigned num_comps = output_num_components[varying][component];
   if (num_comps == 0)
      return NULL;

   assert(output_reg[varying][component].type == reg.type);
   current_annotation = output_reg_annotation[varying];
   if (output_reg[varying][component].file != BAD_FILE) {
      src_reg src = src_reg(output_reg[varying][component]);
      src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
      reg.writemask =
         brw_writemask_for_component_packing(num_comps, component);
      return emit(MOV(reg, src));
   }
   return NULL;
}

void
vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
{
   reg.type = BRW_REGISTER_TYPE_F;
   output_reg[varying][0].type = reg.type;

   switch (varying) {
   case VARYING_SLOT_PSIZ:
   {
      /* PSIZ is always in slot 0, and is coupled with other flags. */
      current_annotation = "indices, point width, clip flags";
      emit_psiz_and_flags(reg);
      break;
   }
   case BRW_VARYING_SLOT_NDC:
      current_annotation = "NDC";
      if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
         emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
      break;
   case VARYING_SLOT_POS:
      current_annotation = "gl_Position";
      if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
         emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
      break;
   case BRW_VARYING_SLOT_PAD:
      /* No need to write to this slot */
      break;
   default:
      for (int i = 0; i < 4; i++) {
         emit_generic_urb_slot(reg, varying, i);
      }
      break;
   }
}

static unsigned
align_interleaved_urb_mlen(const struct intel_device_info *devinfo,
                           unsigned mlen)
{
   if (devinfo->ver >= 6) {
      /* URB data written (does not include the message header reg) must
       * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
       * section 5.4.3.2.2: URB_INTERLEAVED.
       *
       * URB entries are allocated on a multiple of 1024 bits, so an
       * extra 128 bits written here to make the end align to 256 is
       * no problem.
       */
      if ((mlen % 2) != 1)
         mlen++;
   }

   return mlen;
}
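
/* For example, a message of a header plus three data registers (mlen == 4)
 * is padded to mlen == 5 so that the data portion covers an even number of
 * registers.
 */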


/**
 * Generates the VUE payload plus the necessary URB write instructions to
 * output it.
 *
 * The VUE layout is documented in Volume 2a.
 */
void
vec4_visitor::emit_vertex()
{
   /* MRF 0 is reserved for the debugger, so start with message header
    * in MRF 1.
    */
   int base_mrf = 1;
   int mrf = base_mrf;
   /* In the process of generating our URB write message contents, we
    * may need to unspill a register or load from an array. Those
    * reads would use MRFs 14-15.
    */
   int max_usable_mrf = FIRST_SPILL_MRF(devinfo->ver);

   /* The following assertion verifies that max_usable_mrf causes an
    * even-numbered amount of URB write data, which will meet gfx6's
    * requirements for length alignment.
    */
   assert ((max_usable_mrf - base_mrf) % 2 == 0);

   /* First mrf is the g0-based message header containing URB handles and
    * such.
    */
   emit_urb_write_header(mrf++);

   if (devinfo->ver < 6) {
      emit_ndc_computation();
   }

   /* We may need to split this up into several URB writes, so do them in a
    * loop.
    */
   int slot = 0;
   bool complete = false;
   do {
      /* URB offset is in URB row increments, and each of our MRFs is half of
       * one of those, since we're doing interleaved writes.
       */
      int offset = slot / 2;

      mrf = base_mrf + 1;
      for (; slot < prog_data->vue_map.num_slots; ++slot) {
         emit_urb_slot(dst_reg(MRF, mrf++),
                       prog_data->vue_map.slot_to_varying[slot]);

         /* If this was max_usable_mrf, we can't fit anything more into this
          * URB WRITE. Same thing if we reached the maximum length available.
          */
         if (mrf > max_usable_mrf ||
             align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
            slot++;
            break;
         }
      }

      complete = slot >= prog_data->vue_map.num_slots;
      current_annotation = "URB write";
      vec4_instruction *inst = emit_urb_write_opcode(complete);
      inst->base_mrf = base_mrf;
      inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
      inst->offset += offset;
   } while(!complete);
}


src_reg
vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
                                 src_reg *reladdr, int reg_offset)
{
   /* Because we store the values to scratch interleaved like our
    * vertex data, we need to scale the vec4 index by 2.
    */
   int message_header_scale = 2;

   /* Pre-gfx6, the message header uses byte offsets instead of vec4
    * (16-byte) offset units.
    */
   if (devinfo->ver < 6)
      message_header_scale *= 16;

   if (reladdr) {
      /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
       * to multiply the reladdr by 2. Notice that the reg_offset part
       * is in units of 16 bytes and is used to select the low/high 16-byte
       * chunk of a full dvec4, so we don't want to multiply that part.
       */
      src_reg index = src_reg(this, glsl_type::int_type);
      if (type_sz(inst->dst.type) < 8) {
         emit_before(block, inst, ADD(dst_reg(index), *reladdr,
                                      brw_imm_d(reg_offset)));
         emit_before(block, inst, MUL(dst_reg(index), index,
                                      brw_imm_d(message_header_scale)));
      } else {
         emit_before(block, inst, MUL(dst_reg(index), *reladdr,
                                      brw_imm_d(message_header_scale * 2)));
         emit_before(block, inst, ADD(dst_reg(index), index,
                                      brw_imm_d(reg_offset * message_header_scale)));
      }
      return index;
   } else {
      return brw_imm_d(reg_offset * message_header_scale);
   }
}
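
/* On gfx6+ with a relative address this computes, for 32-bit data,
 * index = (reladdr + reg_offset) * 2, matching the interleaved scratch
 * layout; pre-gfx6 the result is additionally scaled up to a byte offset.
 */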

/**
 * Emits an instruction before @inst to load the value named by @orig_src
 * from scratch space at @base_offset to @temp.
 *
 * @base_offset is measured in 32-byte units (the size of a register).
 */
void
vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
                                dst_reg temp, src_reg orig_src,
                                int base_offset)
{
   assert(orig_src.offset % REG_SIZE == 0);
   int reg_offset = base_offset + orig_src.offset / REG_SIZE;
   src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
                                      reg_offset);

   if (type_sz(orig_src.type) < 8) {
      emit_before(block, inst, SCRATCH_READ(temp, index));
   } else {
      dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
      dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
      emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
      index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
      vec4_instruction *last_read =
         SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
      emit_before(block, inst, last_read);
      shuffle_64bit_data(temp, src_reg(shuffled), false, true, block, last_read);
   }
}

/**
 * Emits an instruction after @inst to store the value to be written
 * to @orig_dst to scratch space at @base_offset, from @temp.
 *
 * @base_offset is measured in 32-byte units (the size of a register).
 */
void
vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
                                 int base_offset)
{
   assert(inst->dst.offset % REG_SIZE == 0);
   int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
   src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
                                      reg_offset);

   /* Create a temporary register to store *inst's result in.
    *
    * We have to be careful in MOVing from our temporary result register in
    * the scratch write. If we swizzle from channels of the temporary that
    * weren't initialized, it will confuse live interval analysis, which will
    * make spilling fail to make progress.
    */
   bool is_64bit = type_sz(inst->dst.type) == 8;
   const glsl_type *alloc_type =
      is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
   const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
                                       inst->dst.type),
                                brw_swizzle_for_mask(inst->dst.writemask));

   if (!is_64bit) {
      dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
                                          inst->dst.writemask));
      vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
      if (inst->opcode != BRW_OPCODE_SEL)
         write->predicate = inst->predicate;
      write->ir = inst->ir;
      write->annotation = inst->annotation;
      inst->insert_after(block, write);
   } else {
      dst_reg shuffled = dst_reg(this, alloc_type);
      vec4_instruction *last =
         shuffle_64bit_data(shuffled, temp, true, true, block, inst);
      src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));

      uint8_t mask = 0;
      if (inst->dst.writemask & WRITEMASK_X)
         mask |= WRITEMASK_XY;
      if (inst->dst.writemask & WRITEMASK_Y)
         mask |= WRITEMASK_ZW;
      if (mask) {
         dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));

         vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
         if (inst->opcode != BRW_OPCODE_SEL)
            write->predicate = inst->predicate;
         write->ir = inst->ir;
         write->annotation = inst->annotation;
         last->insert_after(block, write);
      }

      mask = 0;
      if (inst->dst.writemask & WRITEMASK_Z)
         mask |= WRITEMASK_XY;
      if (inst->dst.writemask & WRITEMASK_W)
         mask |= WRITEMASK_ZW;
      if (mask) {
         dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));

         src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
                                            reg_offset + 1);
         vec4_instruction *write =
            SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
         if (inst->opcode != BRW_OPCODE_SEL)
            write->predicate = inst->predicate;
         write->ir = inst->ir;
         write->annotation = inst->annotation;
         last->insert_after(block, write);
      }
   }

   inst->dst.file = temp.file;
   inst->dst.nr = temp.nr;
   inst->dst.offset %= REG_SIZE;
   inst->dst.reladdr = NULL;
}

/**
 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
 * adds the scratch read(s) before \p inst. The function also checks for
 * recursive reladdr scratch accesses, issuing the corresponding scratch
 * loads and rewriting reladdr references accordingly.
 *
 * \return \p src if it did not require a scratch load, otherwise, the
 * register holding the result of the scratch load that the caller should
 * use to rewrite src.
 */
src_reg
vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
                                   vec4_instruction *inst, src_reg src)
{
   /* Resolve recursive reladdr scratch access by calling ourselves
    * with src.reladdr
    */
   if (src.reladdr)
      *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
                                          *src.reladdr);

   /* Now handle scratch access on src */
   if (src.file == VGRF && scratch_loc[src.nr] != -1) {
      dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
                             glsl_type::dvec4_type : glsl_type::vec4_type);
      emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
      src.nr = temp.nr;
      src.offset %= REG_SIZE;
      src.reladdr = NULL;
   }

   return src;
}

/**
 * We can't generally support array access in GRF space, because a
 * single instruction's destination can only span 2 contiguous
 * registers. So, we send all GRF arrays that get variable index
 * access to scratch space.
 */
void
vec4_visitor::move_grf_array_access_to_scratch()
{
   int scratch_loc[this->alloc.count];
   memset(scratch_loc, -1, sizeof(scratch_loc));

   /* First, calculate the set of virtual GRFs that need to be punted
    * to scratch due to having any array access on them, and where in
    * scratch.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == VGRF && inst->dst.reladdr) {
         if (scratch_loc[inst->dst.nr] == -1) {
            scratch_loc[inst->dst.nr] = last_scratch;
            last_scratch += this->alloc.sizes[inst->dst.nr];
         }

         for (src_reg *iter = inst->dst.reladdr;
              iter->reladdr;
              iter = iter->reladdr) {
            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
               scratch_loc[iter->nr] = last_scratch;
               last_scratch += this->alloc.sizes[iter->nr];
            }
         }
      }

      for (int i = 0 ; i < 3; i++) {
         for (src_reg *iter = &inst->src[i];
              iter->reladdr;
              iter = iter->reladdr) {
            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
               scratch_loc[iter->nr] = last_scratch;
               last_scratch += this->alloc.sizes[iter->nr];
            }
         }
      }
   }

   /* Now, for anything that will be accessed through scratch, rewrite
    * it to load/store. Note that this is a _safe list walk, because
    * we may generate a new scratch_write instruction after the one
    * we're processing.
    */
   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      /* Set up the annotation tracking for new generated instructions. */
      base_ir = inst->ir;
      current_annotation = inst->annotation;

      /* First handle scratch access on the dst. Notice we have to handle
       * the case where the dst's reladdr also points to scratch space.
       */
      if (inst->dst.reladdr)
         *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
                                                   *inst->dst.reladdr);

      /* Now that we have handled any (possibly recursive) reladdr scratch
       * accesses for dst we can safely do the scratch write for dst itself
       */
      if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
         emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);

      /* Now handle scratch access on any src. In this case, since inst->src[i]
       * already is a src_reg, we can just call emit_resolve_reladdr with
       * inst->src[i] and it will take care of handling scratch loads for
       * both src and src.reladdr (recursively).
       */
      for (int i = 0 ; i < 3; i++) {
         inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
                                             inst->src[i]);
      }
   }
}

void
vec4_visitor::resolve_ud_negate(src_reg *reg)
{
   if (reg->type != BRW_REGISTER_TYPE_UD ||
       !reg->negate)
      return;

   src_reg temp = src_reg(this, glsl_type::uvec4_type);
   emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
   *reg = temp;
}

vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
                           void *log_data,
                           const struct brw_sampler_prog_key_data *key_tex,
                           struct brw_vue_prog_data *prog_data,
                           const nir_shader *shader,
                           void *mem_ctx,
                           bool no_spills,
                           bool debug_enabled)
   : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base,
                    debug_enabled),
     key_tex(key_tex),
     prog_data(prog_data),
     fail_msg(NULL),
     first_non_payload_grf(0),
     ubo_push_start(),
     push_length(0),
     live_analysis(this), performance_analysis(this),
     need_all_constants_in_pull_buffer(false),
     no_spills(no_spills),
     last_scratch(0)
{
   this->failed = false;

   this->base_ir = NULL;
   this->current_annotation = NULL;
   memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));

   memset(this->output_num_components, 0, sizeof(this->output_num_components));

   this->max_grf = devinfo->ver >= 7 ? GFX7_MRF_HACK_START : BRW_MAX_GRF;

   this->uniforms = 0;

   this->nir_locals = NULL;
   this->nir_ssa_values = NULL;
}


void
vec4_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);

   this->fail_msg = msg;

   if (unlikely(debug_enabled)) {
      fprintf(stderr, "%s", msg);
   }
}

} /* namespace brw */