1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27
28 namespace brw {
29
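/**
 * Construct a vec4 IR instruction with the given opcode, destination and up
 * to three sources. Every other field gets a conservative default: no
 * predication, no saturate, SIMD8 execution size, and so on.
 */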
30 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
31 const src_reg &src0, const src_reg &src1,
32 const src_reg &src2)
33 {
34 this->opcode = opcode;
35 this->dst = dst;
36 this->src[0] = src0;
37 this->src[1] = src1;
38 this->src[2] = src2;
39 this->saturate = false;
40 this->force_writemask_all = false;
41 this->no_dd_clear = false;
42 this->no_dd_check = false;
43 this->writes_accumulator = false;
44 this->conditional_mod = BRW_CONDITIONAL_NONE;
45 this->predicate = BRW_PREDICATE_NONE;
46 this->predicate_inverse = false;
47 this->target = 0;
48 this->shadow_compare = false;
49 this->eot = false;
50 this->ir = NULL;
51 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52 this->header_size = 0;
53 this->flag_subreg = 0;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->exec_size = 8;
58 this->group = 0;
59 this->size_written = (dst.file == BAD_FILE ?
60 0 : this->exec_size * type_sz(dst.type));
61 this->annotation = NULL;
62 }
63
64 vec4_instruction *
65 vec4_visitor::emit(vec4_instruction *inst)
66 {
67 inst->ir = this->base_ir;
68 inst->annotation = this->current_annotation;
69
70 this->instructions.push_tail(inst);
71
72 return inst;
73 }
74
75 vec4_instruction *
76 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
77 vec4_instruction *new_inst)
78 {
79 new_inst->ir = inst->ir;
80 new_inst->annotation = inst->annotation;
81
82 inst->insert_before(block, new_inst);
83
84 return inst;
85 }
86
87 vec4_instruction *
88 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
89 const src_reg &src1, const src_reg &src2)
90 {
91 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
92 }
93
94
95 vec4_instruction *
96 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
97 const src_reg &src1)
98 {
99 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
100 }
101
102 vec4_instruction *
103 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
104 {
105 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
106 }
107
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
110 {
111 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
112 }
113
114 vec4_instruction *
emit(enum opcode opcode)115 vec4_visitor::emit(enum opcode opcode)
116 {
117 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
118 }
119
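/* The ALUn() macros below define builder methods that construct (but do not
 * emit) a vec4_instruction for the corresponding n-source BRW opcode.
 * ALU2_ACC additionally marks the instruction as writing the accumulator,
 * and ALU3 asserts Gen6+ because earlier hardware has no three-source ALU
 * instructions.
 */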
120 #define ALU1(op) \
121 vec4_instruction * \
122 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
123 { \
124 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
125 }
126
127 #define ALU2(op) \
128 vec4_instruction * \
129 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
130 const src_reg &src1) \
131 { \
132 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
133 src0, src1); \
134 }
135
136 #define ALU2_ACC(op) \
137 vec4_instruction * \
138 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
139 const src_reg &src1) \
140 { \
141 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
142 BRW_OPCODE_##op, dst, src0, src1); \
143 inst->writes_accumulator = true; \
144 return inst; \
145 }
146
147 #define ALU3(op) \
148 vec4_instruction * \
149 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
150 const src_reg &src1, const src_reg &src2) \
151 { \
152 assert(devinfo->gen >= 6); \
153 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
154 src0, src1, src2); \
155 }
156
157 ALU1(NOT)
158 ALU1(MOV)
159 ALU1(FRC)
160 ALU1(RNDD)
161 ALU1(RNDE)
162 ALU1(RNDZ)
163 ALU1(F32TO16)
164 ALU1(F16TO32)
165 ALU2(ADD)
166 ALU2(MUL)
167 ALU2_ACC(MACH)
168 ALU2(AND)
169 ALU2(OR)
170 ALU2(XOR)
171 ALU2(DP3)
172 ALU2(DP4)
173 ALU2(DPH)
174 ALU2(SHL)
175 ALU2(SHR)
176 ALU2(ASR)
177 ALU3(LRP)
178 ALU1(BFREV)
179 ALU3(BFE)
180 ALU2(BFI1)
181 ALU3(BFI2)
182 ALU1(FBH)
183 ALU1(FBL)
184 ALU1(CBIT)
185 ALU3(MAD)
186 ALU2_ACC(ADDC)
187 ALU2_ACC(SUBB)
188 ALU2(MAC)
189 ALU1(DIM)
190
191 /** Gen4 predicated IF. */
192 vec4_instruction *
193 vec4_visitor::IF(enum brw_predicate predicate)
194 {
195 vec4_instruction *inst;
196
197 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
198 inst->predicate = predicate;
199
200 return inst;
201 }
202
203 /** Gen6 IF with embedded comparison. */
204 vec4_instruction *
205 vec4_visitor::IF(src_reg src0, src_reg src1,
206 enum brw_conditional_mod condition)
207 {
208 assert(devinfo->gen == 6);
209
210 vec4_instruction *inst;
211
212 resolve_ud_negate(&src0);
213 resolve_ud_negate(&src1);
214
215 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
216 src0, src1);
217 inst->conditional_mod = condition;
218
219 return inst;
220 }
221
222 /**
223 * CMP: Sets the low bit of the destination channels with the result
224 * of the comparison, while the upper bits are undefined, and updates
225 * the flag register with the packed 16 bits of the result.
226 */
227 vec4_instruction *
228 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
229 enum brw_conditional_mod condition)
230 {
231 vec4_instruction *inst;
232
233 /* Take the instruction:
234 *
235 * CMP null<d> src0<f> src1<f>
236 *
237 * Original gen4 does type conversion to the destination type before
238 * comparison, producing garbage results for floating point comparisons.
239 *
240 * The destination type doesn't matter on newer generations, so we set the
241 * type to match src0 so we can compact the instruction.
242 */
243 dst.type = src0.type;
244
245 resolve_ud_negate(&src0);
246 resolve_ud_negate(&src1);
247
248 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
249 inst->conditional_mod = condition;
250
251 return inst;
252 }
253
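/**
 * Builders for the Gen4-style scratch read and write messages used when
 * spilling registers; both take their message payload from the spill MRF
 * range.
 */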
254 vec4_instruction *
255 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
256 {
257 vec4_instruction *inst;
258
259 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
260 dst, index);
261 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
262 inst->mlen = 2;
263
264 return inst;
265 }
266
267 vec4_instruction *
268 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
269 const src_reg &index)
270 {
271 vec4_instruction *inst;
272
273 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
274 dst, src, index);
275 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
276 inst->mlen = 3;
277
278 return inst;
279 }
280
281 src_reg
282 vec4_visitor::fix_3src_operand(const src_reg &src)
283 {
284 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
285 * able to use a vertical stride of zero to replicate the vec4 uniform, like
286 *
287 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
288 *
289 * But you can't, since vertical stride is always four in three-source
290 * instructions. Instead, insert a MOV instruction to do the replication so
291 * that the three-source instruction can consume it.
292 */
293
294 /* The MOV is only needed if the source is a uniform or immediate. */
295 if (src.file != UNIFORM && src.file != IMM)
296 return src;
297
298 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
299 return src;
300
301 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
302 expanded.type = src.type;
303 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
304 return src_reg(expanded);
305 }
306
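/**
 * If the source has abs/negate modifiers applied, copy it through a MOV so
 * the caller gets a plain temporary with the modifiers already resolved.
 */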
307 src_reg
308 vec4_visitor::resolve_source_modifiers(const src_reg &src)
309 {
310 if (!src.abs && !src.negate)
311 return src;
312
313 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
314 resolved.type = src.type;
315 emit(MOV(resolved, src));
316
317 return src_reg(resolved);
318 }
319
320 src_reg
321 vec4_visitor::fix_math_operand(const src_reg &src)
322 {
323 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
324 return src;
325
326 /* The gen6 math instruction ignores the source modifiers --
327 * swizzle, abs, negate, and at least some parts of the register
328 * region description.
329 *
330 * Rather than trying to enumerate all these cases, *always* expand the
331 * operand to a temp GRF for gen6.
332 *
333 * For gen7, keep the operand as-is, except if immediate, which gen7 still
334 * can't use.
335 */
336
337 if (devinfo->gen == 7 && src.file != IMM)
338 return src;
339
340 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
341 expanded.type = src.type;
342 emit(MOV(expanded, src));
343 return src_reg(expanded);
344 }
345
346 vec4_instruction *
347 vec4_visitor::emit_math(enum opcode opcode,
348 const dst_reg &dst,
349 const src_reg &src0, const src_reg &src1)
350 {
351 vec4_instruction *math =
352 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
353
354 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
355 /* MATH on Gen6 must be align1, so we can't do writemasks. */
356 math->dst = dst_reg(this, glsl_type::vec4_type);
357 math->dst.type = dst.type;
358 math = emit(MOV(dst, src_reg(math->dst)));
359 } else if (devinfo->gen < 6) {
360 math->base_mrf = 1;
361 math->mlen = src1.file == BAD_FILE ? 1 : 2;
362 }
363
364 return math;
365 }
366
367 void
368 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
369 {
370 if (devinfo->gen < 7) {
371 unreachable("ir_unop_pack_half_2x16 should be lowered");
372 }
373
374 assert(dst.type == BRW_REGISTER_TYPE_UD);
375 assert(src0.type == BRW_REGISTER_TYPE_F);
376
377 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
378 *
379 * Because this instruction does not have a 16-bit floating-point type,
380 * the destination data type must be Word (W).
381 *
382 * The destination must be DWord-aligned and specify a horizontal stride
383 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
384 * each destination channel and the upper word is not modified.
385 *
386 * The above restriction implies that the f32to16 instruction must use
387 * align1 mode, because only in align1 mode is it possible to specify
388 * horizontal stride. We choose here to defy the hardware docs and emit
389 * align16 instructions.
390 *
391 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
392 * instructions. I was partially successful in that the code passed all
393 * tests. However, the code was dubiously correct and fragile, and the
394 * tests were not harsh enough to probe that frailty. Not trusting the
395 * code, I chose instead to remain in align16 mode in defiance of the hw
396 * docs).
397 *
398 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
399 * simulator, emitting a f32to16 in align16 mode with UD as destination
400 * data type is safe. The behavior differs from that specified in the PRM
401 * in that the upper word of each destination channel is cleared to 0.
402 */
403
404 dst_reg tmp_dst(this, glsl_type::uvec2_type);
405 src_reg tmp_src(tmp_dst);
406
407 #if 0
408 /* Verify the undocumented behavior on which the following instructions
409 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
410 * then the result of the bit-or instruction below will be incorrect.
411 *
412 * You should inspect the disasm output in order to verify that the MOV is
413 * not optimized away.
414 */
415 emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
416 #endif
417
418 /* Give tmp the form below, where "." means untouched.
419 *
420 * w z y x w z y x
421 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
422 *
423 * That the upper word of each write-channel be 0 is required for the
424 * following bit-shift and bit-or instructions to work. Note that this
425 * relies on the undocumented hardware behavior mentioned above.
426 */
427 tmp_dst.writemask = WRITEMASK_XY;
428 emit(F32TO16(tmp_dst, src0));
429
430 /* Give the write-channels of dst the form:
431 * 0xhhhh0000
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
434 emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
435
436 /* Finally, give the write-channels of dst the form of packHalf2x16's
437 * output:
438 * 0xhhhhllll
439 */
440 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
441 emit(OR(dst, src_reg(dst), tmp_src));
442 }
443
444 void
445 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
446 {
447 if (devinfo->gen < 7) {
448 unreachable("ir_unop_unpack_half_2x16 should be lowered");
449 }
450
451 assert(dst.type == BRW_REGISTER_TYPE_F);
452 assert(src0.type == BRW_REGISTER_TYPE_UD);
453
454 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
455 *
456 * Because this instruction does not have a 16-bit floating-point type,
457 * the source data type must be Word (W). The destination type must be
458 * F (Float).
459 *
460 * To use W as the source data type, we must adjust horizontal strides,
461 * which is only possible in align1 mode. All my [chadv] attempts at
462 * emitting align1 instructions for unpackHalf2x16 failed to pass the
463 * Piglit tests, so I gave up.
464 *
465 * I've verified that, on gen7 hardware and the simulator, it is safe to
466 * emit f16to32 in align16 mode with UD as source data type.
467 */
468
469 dst_reg tmp_dst(this, glsl_type::uvec2_type);
470 src_reg tmp_src(tmp_dst);
471
472 tmp_dst.writemask = WRITEMASK_X;
473 emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
474
475 tmp_dst.writemask = WRITEMASK_Y;
476 emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
477
478 dst.writemask = WRITEMASK_XY;
479 emit(F16TO32(dst, tmp_src));
480 }
481
482 void
483 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
484 {
485 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
486 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
487 * is not suitable to generate the shift values, but we can use the packed
488 * vector float and a type-converting MOV.
489 */
490 dst_reg shift(this, glsl_type::uvec4_type);
491 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
492
493 dst_reg shifted(this, glsl_type::uvec4_type);
494 src0.swizzle = BRW_SWIZZLE_XXXX;
495 emit(SHR(shifted, src0, src_reg(shift)));
496
497 shifted.type = BRW_REGISTER_TYPE_UB;
498 dst_reg f(this, glsl_type::vec4_type);
499 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
500
501 emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
502 }
503
504 void
505 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
506 {
507 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
508 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
509 * is not suitable to generate the shift values, but we can use the packed
510 * vector float and a type-converting MOV.
511 */
512 dst_reg shift(this, glsl_type::uvec4_type);
513 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
514
515 dst_reg shifted(this, glsl_type::uvec4_type);
516 src0.swizzle = BRW_SWIZZLE_XXXX;
517 emit(SHR(shifted, src0, src_reg(shift)));
518
519 shifted.type = BRW_REGISTER_TYPE_B;
520 dst_reg f(this, glsl_type::vec4_type);
521 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
522
523 dst_reg scaled(this, glsl_type::vec4_type);
524 emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
525
526 dst_reg max(this, glsl_type::vec4_type);
527 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
528 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
529 }
530
531 void
532 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
533 {
534 dst_reg saturated(this, glsl_type::vec4_type);
535 vec4_instruction *inst = emit(MOV(saturated, src0));
536 inst->saturate = true;
537
538 dst_reg scaled(this, glsl_type::vec4_type);
539 emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
540
541 dst_reg rounded(this, glsl_type::vec4_type);
542 emit(RNDE(rounded, src_reg(scaled)));
543
544 dst_reg u(this, glsl_type::uvec4_type);
545 emit(MOV(u, src_reg(rounded)));
546
547 src_reg bytes(u);
548 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
549 }
550
551 void
552 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
553 {
554 dst_reg max(this, glsl_type::vec4_type);
555 emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
556
557 dst_reg min(this, glsl_type::vec4_type);
558 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
559
560 dst_reg scaled(this, glsl_type::vec4_type);
561 emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
562
563 dst_reg rounded(this, glsl_type::vec4_type);
564 emit(RNDE(rounded, src_reg(scaled)));
565
566 dst_reg i(this, glsl_type::ivec4_type);
567 emit(MOV(i, src_reg(rounded)));
568
569 src_reg bytes(i);
570 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
571 }
572
573 /*
574 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
575 * false) elements needed to pack a type.
576 */
577 static int
578 type_size_xvec4(const struct glsl_type *type, bool as_vec4)
579 {
580 unsigned int i;
581 int size;
582
583 switch (type->base_type) {
584 case GLSL_TYPE_UINT:
585 case GLSL_TYPE_INT:
586 case GLSL_TYPE_FLOAT:
587 case GLSL_TYPE_FLOAT16:
588 case GLSL_TYPE_BOOL:
589 case GLSL_TYPE_DOUBLE:
590 case GLSL_TYPE_UINT16:
591 case GLSL_TYPE_INT16:
592 case GLSL_TYPE_UINT64:
593 case GLSL_TYPE_INT64:
594 if (type->is_matrix()) {
595 const glsl_type *col_type = type->column_type();
596 unsigned col_slots =
597 (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
598 return type->matrix_columns * col_slots;
599 } else {
600 /* Regardless of the size of the vector, it gets a vec4. This is bad
601 * packing for things like floats, but otherwise arrays become a
602 * mess. Hopefully a later pass over the code can pack scalars
603 * down if appropriate.
604 */
605 return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
606 }
607 case GLSL_TYPE_ARRAY:
608 assert(type->length > 0);
609 return type_size_xvec4(type->fields.array, as_vec4) * type->length;
610 case GLSL_TYPE_STRUCT:
611 size = 0;
612 for (i = 0; i < type->length; i++) {
613 size += type_size_xvec4(type->fields.structure[i].type, as_vec4);
614 }
615 return size;
616 case GLSL_TYPE_SUBROUTINE:
617 return 1;
618
619 case GLSL_TYPE_SAMPLER:
620 /* Samplers take up no register space, since they're baked in at
621 * link time.
622 */
623 return 0;
624 case GLSL_TYPE_ATOMIC_UINT:
625 return 0;
626 case GLSL_TYPE_IMAGE:
627 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
628 case GLSL_TYPE_VOID:
629 case GLSL_TYPE_ERROR:
630 case GLSL_TYPE_INTERFACE:
631 case GLSL_TYPE_FUNCTION:
632 unreachable("not reached");
633 }
634
635 return 0;
636 }
637
638 /**
639 * Returns the minimum number of vec4 elements needed to pack a type.
640 *
641 * For simple types, it will return 1 (a single vec4); for matrices, the
642 * number of columns; for array and struct, the sum of the vec4_size of
643 * each of its elements; and for sampler and atomic, zero.
644 *
645 * This method is useful to calculate how much register space is needed to
646 * store a particular type.
647 */
648 extern "C" int
649 type_size_vec4(const struct glsl_type *type)
650 {
651 return type_size_xvec4(type, true);
652 }
653
654 /**
655 * Returns the minimum number of dvec4 elements needed to pack a type.
656 *
657 * For simple types, it will return 1 (a single dvec4); for matrices, the
658 * number of columns; for array and struct, the sum of the dvec4_size of
659 * each of its elements; and for sampler and atomic, zero.
660 *
661 * This method is useful to calculate how much register space is needed to
662 * store a particular type.
663 *
664 * Measuring double-precision vertex inputs as dvec4 is required because
665 * ARB_vertex_attrib_64bit states that they use the same number of locations
666 * as the single-precision version. That is, two consecutive dvec4s would be
667 * located in location "x" and location "x+1", not "x+2".
668 *
669 * In order to map vec4/dvec4 vertex inputs into the proper ATTRs,
670 * remap_vs_attrs() will take into account both the location and whether the
671 * type fits in one or two vec4 slots.
672 */
673 extern "C" int
674 type_size_dvec4(const struct glsl_type *type)
675 {
676 return type_size_xvec4(type, false);
677 }
678
679 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
680 {
681 init();
682
683 this->file = VGRF;
684 this->nr = v->alloc.allocate(type_size_vec4(type));
685
686 if (type->is_array() || type->is_record()) {
687 this->swizzle = BRW_SWIZZLE_NOOP;
688 } else {
689 this->swizzle = brw_swizzle_for_size(type->vector_elements);
690 }
691
692 this->type = brw_type_for_base_type(type);
693 }
694
695 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
696 {
697 assert(size > 0);
698
699 init();
700
701 this->file = VGRF;
702 this->nr = v->alloc.allocate(type_size_vec4(type) * size);
703
704 this->swizzle = BRW_SWIZZLE_NOOP;
705
706 this->type = brw_type_for_base_type(type);
707 }
708
709 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
710 {
711 init();
712
713 this->file = VGRF;
714 this->nr = v->alloc.allocate(type_size_vec4(type));
715
716 if (type->is_array() || type->is_record()) {
717 this->writemask = WRITEMASK_XYZW;
718 } else {
719 this->writemask = (1 << type->vector_elements) - 1;
720 }
721
722 this->type = brw_type_for_base_type(type);
723 }
724
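/**
 * Emit a SEL with the given conditional mod: BRW_CONDITIONAL_GE yields the
 * maximum of the two sources and BRW_CONDITIONAL_L the minimum.
 */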
725 vec4_instruction *
726 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
727 src_reg src0, src_reg src1)
728 {
729 vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
730 inst->conditional_mod = conditionalmod;
731 return inst;
732 }
733
734 vec4_instruction *
735 vec4_visitor::emit_lrp(const dst_reg &dst,
736 const src_reg &x, const src_reg &y, const src_reg &a)
737 {
738 if (devinfo->gen >= 6) {
739 /* Note that the instruction's argument order is reversed from GLSL
740 * and the IR.
741 */
742 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
743 fix_3src_operand(x)));
744 } else {
745 /* Earlier generations don't support three source operations, so we
746 * need to emit x*(1-a) + y*a.
747 */
748 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
749 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
750 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
751 y_times_a.writemask = dst.writemask;
752 one_minus_a.writemask = dst.writemask;
753 x_times_one_minus_a.writemask = dst.writemask;
754
755 emit(MUL(y_times_a, y, a));
756 emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
757 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
758 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
759 }
760 }
761
762 /**
763 * Emits the instructions needed to perform a pull constant load. before_block
764 * and before_inst can be NULL, in which case the instructions will be appended
765 * to the end of the instruction list.
766 */
767 void
768 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
769 src_reg surf_index,
770 src_reg offset_reg,
771 bblock_t *before_block,
772 vec4_instruction *before_inst)
773 {
774 assert((before_inst == NULL && before_block == NULL) ||
775 (before_inst && before_block));
776
777 vec4_instruction *pull;
778
779 if (devinfo->gen >= 9) {
780 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
781 src_reg header(this, glsl_type::uvec4_type, 2);
782
783 pull = new(mem_ctx)
784 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
785 dst_reg(header));
786
787 if (before_inst)
788 emit_before(before_block, before_inst, pull);
789 else
790 emit(pull);
791
792 dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
793 offset_reg.type);
794 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
795
796 if (before_inst)
797 emit_before(before_block, before_inst, pull);
798 else
799 emit(pull);
800
801 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
802 dst,
803 surf_index,
804 header);
805 pull->mlen = 2;
806 pull->header_size = 1;
807 } else if (devinfo->gen >= 7) {
808 dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
809
810 grf_offset.type = offset_reg.type;
811
812 pull = MOV(grf_offset, offset_reg);
813
814 if (before_inst)
815 emit_before(before_block, before_inst, pull);
816 else
817 emit(pull);
818
819 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
820 dst,
821 surf_index,
822 src_reg(grf_offset));
823 pull->mlen = 1;
824 } else {
825 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
826 dst,
827 surf_index,
828 offset_reg);
829 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
830 pull->mlen = 1;
831 }
832
833 if (before_inst)
834 emit_before(before_block, before_inst, pull);
835 else
836 emit(pull);
837 }
838
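/**
 * Copy a value into a register that is guaranteed to be uniform across all
 * channels by broadcasting it from the first live channel.
 */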
839 src_reg
840 vec4_visitor::emit_uniformize(const src_reg &src)
841 {
842 const src_reg chan_index(this, glsl_type::uint_type);
843 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
844 src.type);
845
846 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
847 ->force_writemask_all = true;
848 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
849 ->force_writemask_all = true;
850
851 return src_reg(dst);
852 }
853
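/**
 * Emit a TXF_MCS message fetching the multisample control surface data for
 * the given coordinate, returning the register that holds the MCS value.
 */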
854 src_reg
855 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
856 src_reg coordinate, src_reg surface)
857 {
858 vec4_instruction *inst =
859 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
860 dst_reg(this, glsl_type::uvec4_type));
861 inst->base_mrf = 2;
862 inst->src[1] = surface;
863 inst->src[2] = surface;
864
865 int param_base;
866
867 if (devinfo->gen >= 9) {
868 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
869 vec4_instruction *header_inst = new(mem_ctx)
870 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
871 dst_reg(MRF, inst->base_mrf));
872
873 emit(header_inst);
874
875 inst->mlen = 2;
876 inst->header_size = 1;
877 param_base = inst->base_mrf + 1;
878 } else {
879 inst->mlen = 1;
880 param_base = inst->base_mrf;
881 }
882
883 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
884 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
885 int zero_mask = 0xf & ~coord_mask;
886
887 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
888 coordinate));
889
890 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
891 brw_imm_d(0)));
892
893 emit(inst);
894 return src_reg(inst->dst);
895 }
896
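/**
 * Returns whether the sampler index might not fit in the 4-bit sampler field
 * of the message descriptor and therefore needs a message header. Only
 * Haswell and later can address more than 16 samplers.
 */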
897 bool
898 vec4_visitor::is_high_sampler(src_reg sampler)
899 {
900 if (devinfo->gen < 8 && !devinfo->is_haswell)
901 return false;
902
903 return sampler.file != IMM || sampler.ud >= 16;
904 }
905
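/**
 * Emit the vec4 sampler message for the given IR texture opcode: select the
 * hardware opcode, decide whether a message header is needed, load the
 * coordinate, shadow comparator, LOD/gradient/sample-index parameters into
 * consecutive MRFs, and apply the post-send fixups (txs layer clamping,
 * Gen6 gather workarounds, query_levels swizzling).
 */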
906 void
907 vec4_visitor::emit_texture(ir_texture_opcode op,
908 dst_reg dest,
909 const glsl_type *dest_type,
910 src_reg coordinate,
911 int coord_components,
912 src_reg shadow_comparator,
913 src_reg lod, src_reg lod2,
914 src_reg sample_index,
915 uint32_t constant_offset,
916 src_reg offset_value,
917 src_reg mcs,
918 uint32_t surface,
919 src_reg surface_reg,
920 src_reg sampler_reg)
921 {
922 enum opcode opcode;
923 switch (op) {
924 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
925 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
926 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
927 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
928 case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
929 SHADER_OPCODE_TXF_CMS); break;
930 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
931 case ir_tg4: opcode = offset_value.file != BAD_FILE
932 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
933 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
934 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
935 case ir_txb:
936 unreachable("TXB is not valid for vertex shaders.");
937 case ir_lod:
938 unreachable("LOD is not valid for vertex shaders.");
939 case ir_samples_identical: {
940 /* There are some challenges implementing this for vec4, and it seems
941 * unlikely to be used anyway. For now, just always return false.
942 */
943 emit(MOV(dest, brw_imm_ud(0u)));
944 return;
945 }
946 default:
947 unreachable("Unrecognized tex op");
948 }
949
950 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
951
952 inst->offset = constant_offset;
953
954 /* The message header is necessary for:
955 * - Gen4 (always)
956 * - Gen9+ for selecting SIMD4x2
957 * - Texel offsets
958 * - Gather channel selection
959 * - Sampler indices too large to fit in a 4-bit value.
960 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
961 */
962 inst->header_size =
963 (devinfo->gen < 5 || devinfo->gen >= 9 ||
964 inst->offset != 0 || op == ir_tg4 ||
965 op == ir_texture_samples ||
966 is_high_sampler(sampler_reg)) ? 1 : 0;
967 inst->base_mrf = 2;
968 inst->mlen = inst->header_size;
969 inst->dst.writemask = WRITEMASK_XYZW;
970 inst->shadow_compare = shadow_comparator.file != BAD_FILE;
971
972 inst->src[1] = surface_reg;
973 inst->src[2] = sampler_reg;
974
975 /* MRF for the first parameter */
976 int param_base = inst->base_mrf + inst->header_size;
977
978 if (op == ir_txs || op == ir_query_levels) {
979 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
980 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
981 inst->mlen++;
982 } else if (op == ir_texture_samples) {
983 inst->dst.writemask = WRITEMASK_X;
984 } else {
985 /* Load the coordinate */
986 /* FINISHME: gl_clamp_mask and saturate */
987 int coord_mask = (1 << coord_components) - 1;
988 int zero_mask = 0xf & ~coord_mask;
989
990 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
991 coordinate));
992 inst->mlen++;
993
994 if (zero_mask != 0) {
995 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
996 brw_imm_d(0)));
997 }
998 /* Load the shadow comparator */
999 if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
1000 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
1001 WRITEMASK_X),
1002 shadow_comparator));
1003 inst->mlen++;
1004 }
1005
1006 /* Load the LOD info */
1007 if (op == ir_tex || op == ir_txl) {
1008 int mrf, writemask;
1009 if (devinfo->gen >= 5) {
1010 mrf = param_base + 1;
1011 if (shadow_comparator.file != BAD_FILE) {
1012 writemask = WRITEMASK_Y;
1013 /* mlen already incremented */
1014 } else {
1015 writemask = WRITEMASK_X;
1016 inst->mlen++;
1017 }
1018 } else /* devinfo->gen == 4 */ {
1019 mrf = param_base;
1020 writemask = WRITEMASK_W;
1021 }
1022 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
1023 } else if (op == ir_txf) {
1024 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
1025 } else if (op == ir_txf_ms) {
1026 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1027 sample_index));
1028 if (opcode == SHADER_OPCODE_TXF_CMS_W) {
1029 /* MCS data is stored in the first two channels of 'mcs', but we
1030 * need to get it into the .y and .z channels of the second vec4
1031 * of params.
1032 */
1033 mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1034 emit(MOV(dst_reg(MRF, param_base + 1,
1035 glsl_type::uint_type, WRITEMASK_YZ),
1036 mcs));
1037 } else if (devinfo->gen >= 7) {
1038 /* MCS data is in the first channel of `mcs`, but we need to get it into
1039 * the .y channel of the second vec4 of params, so replicate .x across
1040 * the whole vec4 and then mask off everything except .y
1041 */
1042 mcs.swizzle = BRW_SWIZZLE_XXXX;
1043 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1044 mcs));
1045 }
1046 inst->mlen++;
1047 } else if (op == ir_txd) {
1048 const brw_reg_type type = lod.type;
1049
1050 if (devinfo->gen >= 5) {
1051 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1052 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1053 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1054 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1055 inst->mlen++;
1056
1057 if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) {
1058 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1059 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1060 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1061 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1062 inst->mlen++;
1063
1064 if (shadow_comparator.file != BAD_FILE) {
1065 emit(MOV(dst_reg(MRF, param_base + 2,
1066 shadow_comparator.type, WRITEMASK_Z),
1067 shadow_comparator));
1068 }
1069 }
1070 } else /* devinfo->gen == 4 */ {
1071 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1072 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1073 inst->mlen += 2;
1074 }
1075 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1076 if (shadow_comparator.file != BAD_FILE) {
1077 emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
1078 shadow_comparator));
1079 }
1080
1081 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1082 offset_value));
1083 inst->mlen++;
1084 }
1085 }
1086
1087 emit(inst);
1088
1089 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1090 * spec requires layers.
1091 */
1092 if (op == ir_txs && devinfo->gen < 7) {
1093 /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
1094 emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
1095 src_reg(inst->dst), brw_imm_d(1));
1096 }
1097
1098 if (devinfo->gen == 6 && op == ir_tg4) {
1099 emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1100 }
1101
1102 if (op == ir_query_levels) {
1103 /* # levels is in .w */
1104 src_reg swizzled(dest);
1105 swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1106 SWIZZLE_W, SWIZZLE_W);
1107 emit(MOV(dest, swizzled));
1108 }
1109 }
1110
1111 /**
1112 * Apply workarounds for Gen6 gather with UINT/SINT
1113 */
1114 void
1115 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1116 {
1117 if (!wa)
1118 return;
1119
1120 int width = (wa & WA_8BIT) ? 8 : 16;
1121 dst_reg dst_f = dst;
1122 dst_f.type = BRW_REGISTER_TYPE_F;
1123
1124 /* Convert from UNORM to UINT */
1125 emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1126 emit(MOV(dst, src_reg(dst_f)));
1127
1128 if (wa & WA_SIGN) {
1129 /* Reinterpret the UINT value as a signed INT value by
1130 * shifting the sign bit into place, then shifting back
1131 * preserving sign.
1132 */
1133 emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1134 emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1135 }
1136 }
1137
1138 void
1139 vec4_visitor::gs_emit_vertex(int /* stream_id */)
1140 {
1141 unreachable("not reached");
1142 }
1143
1144 void
1145 vec4_visitor::gs_end_primitive()
1146 {
1147 unreachable("not reached");
1148 }
1149
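/**
 * Compute the NDC output (x/w, y/w, z/w, 1/w) from the written gl_Position.
 * Only used on Gen4/5; emit_vertex() skips this on Gen6+.
 */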
1150 void
1151 vec4_visitor::emit_ndc_computation()
1152 {
1153 if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
1154 return;
1155
1156 /* Get the position */
1157 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);
1158
1159 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1160 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1161 output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
1162 output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;
1163
1164 current_annotation = "NDC";
1165 dst_reg ndc_w = ndc;
1166 ndc_w.writemask = WRITEMASK_W;
1167 src_reg pos_w = pos;
1168 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1169 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1170
1171 dst_reg ndc_xyz = ndc;
1172 ndc_xyz.writemask = WRITEMASK_XYZ;
1173
1174 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1175 }
1176
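/**
 * Write the VUE header slot that holds point size and related flags. On
 * Gen4/5 this packs point size, the clip-distance flags and the
 * negative-rhw workaround bits into one header dword; on Gen6+ it writes
 * point size, layer and viewport index into the W/Y/Z channels of the slot.
 */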
1177 void
1178 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1179 {
1180 if (devinfo->gen < 6 &&
1181 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1182 output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
1183 devinfo->has_negative_rhw_bug)) {
1184 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1185 dst_reg header1_w = header1;
1186 header1_w.writemask = WRITEMASK_W;
1187
1188 emit(MOV(header1, brw_imm_ud(0u)));
1189
1190 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1191 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1192
1193 current_annotation = "Point size";
1194 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1195 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1196 }
1197
1198 if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
1199 current_annotation = "Clipping flags";
1200 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1201 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1202
1203 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1204 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1205 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1206
1207 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1208 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1209 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1210 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1211 }
1212
1213 /* i965 clipping workaround:
1214 * 1) Test for -ve rhw
1215 * 2) If set,
1216 * set ndc = (0,0,0,0)
1217 * set ucp[6] = 1
1218 *
1219 * Later, clipping will detect ucp[6] and ensure the primitive is
1220 * clipped against all fixed planes.
1221 */
1222 if (devinfo->has_negative_rhw_bug &&
1223 output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
1224 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
1225 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1226 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1227 vec4_instruction *inst;
1228 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1229 inst->predicate = BRW_PREDICATE_NORMAL;
1230 output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
1231 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
1232 inst->predicate = BRW_PREDICATE_NORMAL;
1233 }
1234
1235 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1236 } else if (devinfo->gen < 6) {
1237 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1238 } else {
1239 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1240 if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) {
1241 dst_reg reg_w = reg;
1242 reg_w.writemask = WRITEMASK_W;
1243 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1244 reg_as_src.type = reg_w.type;
1245 reg_as_src.swizzle = brw_swizzle_for_size(1);
1246 emit(MOV(reg_w, reg_as_src));
1247 }
1248 if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) {
1249 dst_reg reg_y = reg;
1250 reg_y.writemask = WRITEMASK_Y;
1251 reg_y.type = BRW_REGISTER_TYPE_D;
1252 output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
1253 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
1254 }
1255 if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) {
1256 dst_reg reg_z = reg;
1257 reg_z.writemask = WRITEMASK_Z;
1258 reg_z.type = BRW_REGISTER_TYPE_D;
1259 output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
1260 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
1261 }
1262 }
1263 }
1264
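/**
 * Emit the MOV that copies one component group of a generic varying into
 * its URB slot, using the writemask appropriate for component packing.
 * Returns NULL if nothing is written for this component.
 */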
1265 vec4_instruction *
1266 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
1267 {
1268 assert(varying < VARYING_SLOT_MAX);
1269
1270 unsigned num_comps = output_num_components[varying][component];
1271 if (num_comps == 0)
1272 return NULL;
1273
1274 assert(output_reg[varying][component].type == reg.type);
1275 current_annotation = output_reg_annotation[varying];
1276 if (output_reg[varying][component].file != BAD_FILE) {
1277 src_reg src = src_reg(output_reg[varying][component]);
1278 src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
1279 reg.writemask =
1280 brw_writemask_for_component_packing(num_comps, component);
1281 return emit(MOV(reg, src));
1282 }
1283 return NULL;
1284 }
1285
1286 void
1287 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1288 {
1289 reg.type = BRW_REGISTER_TYPE_F;
1290 output_reg[varying][0].type = reg.type;
1291
1292 switch (varying) {
1293 case VARYING_SLOT_PSIZ:
1294 {
1295 /* PSIZ is always in slot 0, and is coupled with other flags. */
1296 current_annotation = "indices, point width, clip flags";
1297 emit_psiz_and_flags(reg);
1298 break;
1299 }
1300 case BRW_VARYING_SLOT_NDC:
1301 current_annotation = "NDC";
1302 if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
1303 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
1304 break;
1305 case VARYING_SLOT_POS:
1306 current_annotation = "gl_Position";
1307 if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
1308 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
1309 break;
1310 case VARYING_SLOT_EDGE: {
1311 /* This is present when doing unfilled polygons. We're supposed to copy
1312 * the edge flag from the user-provided vertex array
1313 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1314 * of that attribute (starts as 1.0f). This is then used in clipping to
1315 * determine which edges should be drawn as wireframe.
1316 */
1317 current_annotation = "edge flag";
1318 int edge_attr = _mesa_bitcount_64(nir->info.inputs_read &
1319 BITFIELD64_MASK(VERT_ATTRIB_EDGEFLAG));
1320 emit(MOV(reg, src_reg(dst_reg(ATTR, edge_attr,
1321 glsl_type::float_type, WRITEMASK_XYZW))));
1322 break;
1323 }
1324 case BRW_VARYING_SLOT_PAD:
1325 /* No need to write to this slot */
1326 break;
1327 default:
1328 for (int i = 0; i < 4; i++) {
1329 emit_generic_urb_slot(reg, varying, i);
1330 }
1331 break;
1332 }
1333 }
1334
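/**
 * Round an interleaved URB write message length up so that the data portion
 * (everything after the message header) is a whole number of 256-bit units,
 * as required on Gen6+.
 */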
1335 static int
1336 align_interleaved_urb_mlen(const struct gen_device_info *devinfo, int mlen)
1337 {
1338 if (devinfo->gen >= 6) {
1339 /* URB data written (does not include the message header reg) must
1340 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1341 * section 5.4.3.2.2: URB_INTERLEAVED.
1342 *
1343 * URB entries are allocated on a multiple of 1024 bits, so an
1344 * extra 128 bits written here to make the end align to 256 is
1345 * no problem.
1346 */
1347 if ((mlen % 2) != 1)
1348 mlen++;
1349 }
1350
1351 return mlen;
1352 }
1353
1354
1355 /**
1356 * Generates the VUE payload plus the necessary URB write instructions to
1357 * output it.
1358 *
1359 * The VUE layout is documented in Volume 2a.
1360 */
1361 void
1362 vec4_visitor::emit_vertex()
1363 {
1364 /* MRF 0 is reserved for the debugger, so start with message header
1365 * in MRF 1.
1366 */
1367 int base_mrf = 1;
1368 int mrf = base_mrf;
1369 /* In the process of generating our URB write message contents, we
1370 * may need to unspill a register or load from an array. Those
1371 * reads would use MRFs 14-15.
1372 */
1373 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1374
1375 /* The following assertion verifies that max_usable_mrf causes an
1376 * even-numbered amount of URB write data, which will meet gen6's
1377 * requirements for length alignment.
1378 */
1379 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1380
1381 /* First mrf is the g0-based message header containing URB handles and
1382 * such.
1383 */
1384 emit_urb_write_header(mrf++);
1385
1386 if (devinfo->gen < 6) {
1387 emit_ndc_computation();
1388 }
1389
1390 /* We may need to split this up into several URB writes, so do them in a
1391 * loop.
1392 */
1393 int slot = 0;
1394 bool complete = false;
1395 do {
1396 /* URB offset is in URB row increments, and each of our MRFs is half of
1397 * one of those, since we're doing interleaved writes.
1398 */
1399 int offset = slot / 2;
1400
1401 mrf = base_mrf + 1;
1402 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1403 emit_urb_slot(dst_reg(MRF, mrf++),
1404 prog_data->vue_map.slot_to_varying[slot]);
1405
1406 /* If this was max_usable_mrf, we can't fit anything more into this
1407 * URB WRITE. Same thing if we reached the maximum length available.
1408 */
1409 if (mrf > max_usable_mrf ||
1410 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1411 slot++;
1412 break;
1413 }
1414 }
1415
1416 complete = slot >= prog_data->vue_map.num_slots;
1417 current_annotation = "URB write";
1418 vec4_instruction *inst = emit_urb_write_opcode(complete);
1419 inst->base_mrf = base_mrf;
1420 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1421 inst->offset += offset;
1422 } while(!complete);
1423 }
1424
1425
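/**
 * Compute the offset source for a scratch read/write message from a register
 * offset and an optional relative address, scaling for the interleaved vec4
 * layout (and for byte-based offsets prior to Gen6) and accounting for
 * 64-bit accesses that span two 16-byte halves.
 */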
1426 src_reg
1427 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1428 src_reg *reladdr, int reg_offset)
1429 {
1430 /* Because we store the values to scratch interleaved like our
1431 * vertex data, we need to scale the vec4 index by 2.
1432 */
1433 int message_header_scale = 2;
1434
1435 /* Pre-gen6, the message header uses byte offsets instead of vec4
1436 * (16-byte) offset units.
1437 */
1438 if (devinfo->gen < 6)
1439 message_header_scale *= 16;
1440
1441 if (reladdr) {
1442 /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
1443 * to multiply the reladdr by 2. Notice that the reg_offset part
1444 * is in units of 16 bytes and is used to select the low/high 16-byte
1445 * chunk of a full dvec4, so we don't want to multiply that part.
1446 */
1447 src_reg index = src_reg(this, glsl_type::int_type);
1448 if (type_sz(inst->dst.type) < 8) {
1449 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1450 brw_imm_d(reg_offset)));
1451 emit_before(block, inst, MUL(dst_reg(index), index,
1452 brw_imm_d(message_header_scale)));
1453 } else {
1454 emit_before(block, inst, MUL(dst_reg(index), *reladdr,
1455 brw_imm_d(message_header_scale * 2)));
1456 emit_before(block, inst, ADD(dst_reg(index), index,
1457 brw_imm_d(reg_offset * message_header_scale)));
1458 }
1459 return index;
1460 } else {
1461 return brw_imm_d(reg_offset * message_header_scale);
1462 }
1463 }
1464
1465 /**
1466 * Emits an instruction before @inst to load the value named by @orig_src
1467 * from scratch space at @base_offset to @temp.
1468 *
1469 * @base_offset is measured in 32-byte units (the size of a register).
1470 */
1471 void
1472 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1473 dst_reg temp, src_reg orig_src,
1474 int base_offset)
1475 {
1476 assert(orig_src.offset % REG_SIZE == 0);
1477 int reg_offset = base_offset + orig_src.offset / REG_SIZE;
1478 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1479 reg_offset);
1480
1481 if (type_sz(orig_src.type) < 8) {
1482 emit_before(block, inst, SCRATCH_READ(temp, index));
1483 } else {
1484 dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
1485 dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
1486 emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
1487 index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
1488 vec4_instruction *last_read =
1489 SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
1490 emit_before(block, inst, last_read);
1491 shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
1492 }
1493 }
1494
1495 /**
1496 * Emits an instruction after @inst to store the value to be written
1497 * to @orig_dst to scratch space at @base_offset, from @temp.
1498 *
1499 * @base_offset is measured in 32-byte units (the size of a register).
1500 */
1501 void
1502 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1503 int base_offset)
1504 {
1505 assert(inst->dst.offset % REG_SIZE == 0);
1506 int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
1507 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1508 reg_offset);
1509
1510 /* Create a temporary register to store *inst's result in.
1511 *
1512 * We have to be careful in MOVing from our temporary result register in
1513 * the scratch write. If we swizzle from channels of the temporary that
1514 * weren't initialized, it will confuse live interval analysis, which will
1515 * make spilling fail to make progress.
1516 */
1517 bool is_64bit = type_sz(inst->dst.type) == 8;
1518 const glsl_type *alloc_type =
1519 is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
1520 const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
1521 inst->dst.type),
1522 brw_swizzle_for_mask(inst->dst.writemask));
1523
1524 if (!is_64bit) {
1525 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1526 inst->dst.writemask));
1527 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1528 if (inst->opcode != BRW_OPCODE_SEL)
1529 write->predicate = inst->predicate;
1530 write->ir = inst->ir;
1531 write->annotation = inst->annotation;
1532 inst->insert_after(block, write);
1533 } else {
1534 dst_reg shuffled = dst_reg(this, alloc_type);
1535 vec4_instruction *last =
1536 shuffle_64bit_data(shuffled, temp, true, block, inst);
1537 src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
1538
1539 uint8_t mask = 0;
1540 if (inst->dst.writemask & WRITEMASK_X)
1541 mask |= WRITEMASK_XY;
1542 if (inst->dst.writemask & WRITEMASK_Y)
1543 mask |= WRITEMASK_ZW;
1544 if (mask) {
1545 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1546
1547 vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
1548 if (inst->opcode != BRW_OPCODE_SEL)
1549 write->predicate = inst->predicate;
1550 write->ir = inst->ir;
1551 write->annotation = inst->annotation;
1552 last->insert_after(block, write);
1553 }
1554
1555 mask = 0;
1556 if (inst->dst.writemask & WRITEMASK_Z)
1557 mask |= WRITEMASK_XY;
1558 if (inst->dst.writemask & WRITEMASK_W)
1559 mask |= WRITEMASK_ZW;
1560 if (mask) {
1561 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1562
1563 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1564 reg_offset + 1);
1565 vec4_instruction *write =
1566 SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
1567 if (inst->opcode != BRW_OPCODE_SEL)
1568 write->predicate = inst->predicate;
1569 write->ir = inst->ir;
1570 write->annotation = inst->annotation;
1571 last->insert_after(block, write);
1572 }
1573 }
1574
1575 inst->dst.file = temp.file;
1576 inst->dst.nr = temp.nr;
1577 inst->dst.offset %= REG_SIZE;
1578 inst->dst.reladdr = NULL;
1579 }
1580
1581 /**
1582 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1583 * adds the scratch read(s) before \p inst. The function also checks for
1584 * recursive reladdr scratch accesses, issuing the corresponding scratch
1585 * loads and rewriting reladdr references accordingly.
1586 *
1587 * \return \p src if it did not require a scratch load, otherwise, the
1588 * register holding the result of the scratch load that the caller should
1589 * use to rewrite src.
1590 */
1591 src_reg
1592 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1593 vec4_instruction *inst, src_reg src)
1594 {
1595 /* Resolve recursive reladdr scratch access by calling ourselves
1596 * with src.reladdr
1597 */
1598 if (src.reladdr)
1599 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1600 *src.reladdr);
1601
1602 /* Now handle scratch access on src */
1603 if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1604 dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
1605 glsl_type::dvec4_type : glsl_type::vec4_type);
1606 emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1607 src.nr = temp.nr;
1608 src.offset %= REG_SIZE;
1609 src.reladdr = NULL;
1610 }
1611
1612 return src;
1613 }
1614
1615 /**
1616 * We can't generally support array access in GRF space, because a
1617 * single instruction's destination can only span 2 contiguous
1618 * registers. So, we send all GRF arrays that get variable index
1619 * access to scratch space.
1620 */
1621 void
1622 vec4_visitor::move_grf_array_access_to_scratch()
1623 {
1624 int scratch_loc[this->alloc.count];
1625 memset(scratch_loc, -1, sizeof(scratch_loc));
1626
1627 /* First, calculate the set of virtual GRFs that need to be punted
1628 * to scratch due to having any array access on them, and where in
1629 * scratch.
1630 */
1631 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1632 if (inst->dst.file == VGRF && inst->dst.reladdr) {
1633 if (scratch_loc[inst->dst.nr] == -1) {
1634 scratch_loc[inst->dst.nr] = last_scratch;
1635 last_scratch += this->alloc.sizes[inst->dst.nr];
1636 }
1637
1638 for (src_reg *iter = inst->dst.reladdr;
1639 iter->reladdr;
1640 iter = iter->reladdr) {
1641 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1642 scratch_loc[iter->nr] = last_scratch;
1643 last_scratch += this->alloc.sizes[iter->nr];
1644 }
1645 }
1646 }
1647
1648 for (int i = 0 ; i < 3; i++) {
1649 for (src_reg *iter = &inst->src[i];
1650 iter->reladdr;
1651 iter = iter->reladdr) {
1652 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1653 scratch_loc[iter->nr] = last_scratch;
1654 last_scratch += this->alloc.sizes[iter->nr];
1655 }
1656 }
1657 }
1658 }
1659
1660 /* Now, for anything that will be accessed through scratch, rewrite
1661 * it to load/store. Note that this is a _safe list walk, because
1662 * we may generate a new scratch_write instruction after the one
1663 * we're processing.
1664 */
1665 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1666 /* Set up the annotation tracking for newly generated instructions. */
1667 base_ir = inst->ir;
1668 current_annotation = inst->annotation;
1669
1670 /* First handle scratch access on the dst. Notice we have to handle
1671 * the case where the dst's reladdr also points to scratch space.
1672 */
1673 if (inst->dst.reladdr)
1674 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1675 *inst->dst.reladdr);
1676
1677 /* Now that we have handled any (possibly recursive) reladdr scratch
1678 * accesses for dst we can safely do the scratch write for dst itself
1679 */
1680 if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1681 emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1682
1683 /* Now handle scratch access on any src. In this case, since inst->src[i]
1684 * already is a src_reg, we can just call emit_resolve_reladdr with
1685 * inst->src[i] and it will take care of handling scratch loads for
1686 * both src and src.reladdr (recursively).
1687 */
1688 for (int i = 0; i < 3; i++) {
1689 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1690 inst->src[i]);
1691 }
1692 }
1693 }
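/* Added illustrative example (not upstream code) of the scratch-slot
 * accounting in the first pass above; sizes are in units of
 * this->alloc.sizes[] and the VGRF numbers are made up.
 *
 *    // last_scratch == 0; v3 (4 registers) has a reladdr destination and
 *    // v8 (2 registers) only appears as an indirect source:
 *    //    scratch_loc[3] = 0;   last_scratch = 4;
 *    //    scratch_loc[8] = 4;   last_scratch = 6;
 *    // The second, _safe walk then rewrites every access to v3 and v8 into
 *    // scratch reads/writes via emit_resolve_reladdr()/emit_scratch_write().
 */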
1694
1695 /**
1696 * Emits an instruction before @inst to load the value named by @orig_src
1697 * from the pull constant buffer (surface) at @base_offset to @temp.
1698 */
1699 void
1700 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1701 dst_reg temp, src_reg orig_src,
1702 int base_offset, src_reg indirect)
1703 {
1704 assert(orig_src.offset % 16 == 0);
1705 const unsigned index = prog_data->base.binding_table.pull_constants_start;
1706
1707 /* For 64-bit loads we need to emit two 32-bit load messages, and we also
1708 * need to shuffle the 32-bit data result into proper 64-bit data. To do
1709 * that we emit the 32-bit loads into a temporary and we shuffle the result
1710 * into the original destination.
1711 */
1712 dst_reg orig_temp = temp;
1713 bool is_64bit = type_sz(orig_src.type) == 8;
1714 if (is_64bit) {
1715 assert(type_sz(temp.type) == 8);
1716 dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
1717 temp = retype(temp_df, BRW_REGISTER_TYPE_F);
1718 }
1719
1720 src_reg src = orig_src;
1721 for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
1722 int reg_offset = base_offset + src.offset / 16;
1723
1724 src_reg offset;
1725 if (indirect.file != BAD_FILE) {
1726 offset = src_reg(this, glsl_type::uint_type);
1727 emit_before(block, inst, ADD(dst_reg(offset), indirect,
1728 brw_imm_ud(reg_offset * 16)));
1729 } else if (devinfo->gen >= 8) {
1730 /* Store the offset in a GRF so we can send-from-GRF. */
1731 offset = src_reg(this, glsl_type::uint_type);
1732 emit_before(block, inst, MOV(dst_reg(offset),
1733 brw_imm_ud(reg_offset * 16)));
1734 } else {
1735 offset = brw_imm_d(reg_offset * 16);
1736 }
1737
1738 emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
1739 brw_imm_ud(index),
1740 offset,
1741 block, inst);
1742
1743 src = byte_offset(src, 16);
1744 }
1745
1746 brw_mark_surface_used(&prog_data->base, index);
1747
1748 if (is_64bit) {
1749 temp = retype(temp, BRW_REGISTER_TYPE_DF);
1750 shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
1751 }
1752 }
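/* Added worked example (not upstream code) of the offset arithmetic above,
 * with made-up numbers.  Each pull-constant message fetches one 16-byte
 * vec4 slot:
 *
 *    //    base_offset = 2, orig_src.offset = 32 (16-byte aligned)
 *    //    reg_offset  = 2 + 32 / 16 = 4   ->  byte offset 4 * 16 = 64
 *    // For a dvec4 source (is_64bit) a second message follows after
 *    // src = byte_offset(src, 16):
 *    //    reg_offset  = 2 + 48 / 16 = 5   ->  byte offset 80
 *    // and shuffle_64bit_data() repacks both 32-bit results into the
 *    // original DF destination.
 */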
1753
1754 /**
1755 * Implements array access of uniforms by inserting a
1756 * PULL_CONSTANT_LOAD instruction.
1757 *
1758 * Unlike temporary GRF array access (where we don't support it due to
1759 * the difficulty of doing relative addressing on instruction
1760 * destinations), we could potentially do array access of uniforms
1761 * that were loaded in GRF space as push constants. In real-world
1762 * usage we've seen, though, the arrays being used are always larger
1763 * than we could load as push constants, so just always move all
1764 * uniform array access out to a pull constant buffer.
1765 */
1766 void
1767 vec4_visitor::move_uniform_array_access_to_pull_constants()
1768 {
1769 /* The Vulkan driver doesn't support pull constants other than UBOs, so
1770 * everything has to be pushed regardless.
1771 */
1772 if (!compiler->supports_pull_constants) {
1773 split_uniform_registers();
1774 return;
1775 }
1776
1777 /* Allocate the pull_params array */
1778 assert(stage_prog_data->nr_pull_params == 0);
1779 stage_prog_data->pull_param = ralloc_array(mem_ctx, uint32_t,
1780 this->uniforms * 4);
1781
1782 int pull_constant_loc[this->uniforms];
1783 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1784
1785 /* First, walk through the instructions and determine which things need to
1786 * be pulled. We mark something as needing to be pulled by setting
1787 * pull_constant_loc to 0.
1788 */
1789 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1790 /* We only care about MOV_INDIRECT of a uniform */
1791 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1792 inst->src[0].file != UNIFORM)
1793 continue;
1794
1795 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1796
1797 for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1798 pull_constant_loc[uniform_nr + j] = 0;
1799 }
1800
1801 /* Next, we walk the list of uniforms and assign real pull constant
1802 * locations and set their corresponding entries in pull_param.
1803 */
1804 for (int j = 0; j < this->uniforms; j++) {
1805 if (pull_constant_loc[j] < 0)
1806 continue;
1807
1808 pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1809
1810 for (int i = 0; i < 4; i++) {
1811 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1812 = stage_prog_data->param[j * 4 + i];
1813 }
1814 }
1815
1816 /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1817 * instructions to actual uniform pulls.
1818 */
1819 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1820 /* We only care about MOV_INDIRECT of a uniform */
1821 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1822 inst->src[0].file != UNIFORM)
1823 continue;
1824
1825 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1826
1827 assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1828
1829 emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1830 pull_constant_loc[uniform_nr], inst->src[1]);
1831 inst->remove(block);
1832 }
1833
1834 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1835 * no need to track them as larger-than-vec4 objects. This will be
1836 * relied on in cutting out unused uniform vectors from push
1837 * constants.
1838 */
1839 split_uniform_registers();
1840 }
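/* Added illustrative example (not upstream code) of the pull-constant
 * packing above, using hypothetical uniform indices.
 *
 *    // Suppose vec4 uniforms 3 and 7 are read through MOV_INDIRECT:
 *    //    pull_constant_loc[3] = 0;   pull_param[0..3] = param[12..15];
 *    //    pull_constant_loc[7] = 1;   pull_param[4..7] = param[28..31];
 *    // Each MOV_INDIRECT of those uniforms is then lowered to a pull
 *    // constant load at its assigned location and removed from the IR.
 */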
1841
1842 void
1843 vec4_visitor::resolve_ud_negate(src_reg *reg)
1844 {
1845 if (reg->type != BRW_REGISTER_TYPE_UD ||
1846 !reg->negate)
1847 return;
1848
1849 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1850 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1851 *reg = temp;
1852 }
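/* Added usage sketch (not upstream code): a negated UD source is resolved
 * by materializing the negation through a MOV into a temporary, so later
 * code sees a plain, unmodified source.  Hypothetical example:
 *
 *    src_reg counter = ...;            // BRW_REGISTER_TYPE_UD
 *    counter.negate = true;
 *    resolve_ud_negate(&counter);      // counter now reads a uvec4 temp
 *                                      // holding the negated value
 */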
1853
1854 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1855 void *log_data,
1856 const struct brw_sampler_prog_key_data *key_tex,
1857 struct brw_vue_prog_data *prog_data,
1858 const nir_shader *shader,
1859 void *mem_ctx,
1860 bool no_spills,
1861 int shader_time_index)
1862 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1863 key_tex(key_tex),
1864 prog_data(prog_data),
1865 fail_msg(NULL),
1866 first_non_payload_grf(0),
1867 need_all_constants_in_pull_buffer(false),
1868 no_spills(no_spills),
1869 shader_time_index(shader_time_index),
1870 last_scratch(0)
1871 {
1872 this->failed = false;
1873
1874 this->base_ir = NULL;
1875 this->current_annotation = NULL;
1876 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1877
1878 memset(this->output_num_components, 0, sizeof(this->output_num_components));
1879
1880 this->virtual_grf_start = NULL;
1881 this->virtual_grf_end = NULL;
1882 this->live_intervals = NULL;
1883
1884 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1885
1886 this->uniforms = 0;
1887 }
1888
1889
1890 void
1891 vec4_visitor::fail(const char *format, ...)
1892 {
1893 va_list va;
1894 char *msg;
1895
1896 if (failed)
1897 return;
1898
1899 failed = true;
1900
1901 va_start(va, format);
1902 msg = ralloc_vasprintf(mem_ctx, format, va);
1903 va_end(va);
1904 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1905
1906 this->fail_msg = msg;
1907
1908 if (debug_enabled) {
1909 fprintf(stderr, "%s", msg);
1910 }
1911 }
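/* Added usage sketch (not upstream code): fail() takes a printf-style
 * format, latches only the first failure, stores the message in fail_msg
 * and prints it when debug output is enabled.  Hypothetical call:
 *
 *    if (grf_used > max_grf)
 *       fail("register allocation needed %d GRFs, only %d available\n",
 *            grf_used, max_grf);
 */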
1912
1913 } /* namespace brw */
1914