1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27 #include "brw_program.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->shadow_compare = false;
50 this->ir = NULL;
51 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52 this->header_size = 0;
53 this->flag_subreg = 0;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->exec_size = 8;
58 this->group = 0;
59 this->size_written = (dst.file == BAD_FILE ?
60 0 : this->exec_size * type_sz(dst.type));
61 this->annotation = NULL;
62 }
63
64 vec4_instruction *
65 vec4_visitor::emit(vec4_instruction *inst)
66 {
67 inst->ir = this->base_ir;
68 inst->annotation = this->current_annotation;
69
70 this->instructions.push_tail(inst);
71
72 return inst;
73 }
74
75 vec4_instruction *
76 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
77 vec4_instruction *new_inst)
78 {
79 new_inst->ir = inst->ir;
80 new_inst->annotation = inst->annotation;
81
82 inst->insert_before(block, new_inst);
83
84 return inst;
85 }
86
87 vec4_instruction *
88 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
89 const src_reg &src1, const src_reg &src2)
90 {
91 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
92 }
93
94
95 vec4_instruction *
96 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
97 const src_reg &src1)
98 {
99 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
100 }
101
102 vec4_instruction *
103 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
104 {
105 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
106 }
107
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
110 {
111 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
112 }
113
114 vec4_instruction *
emit(enum opcode opcode)115 vec4_visitor::emit(enum opcode opcode)
116 {
117 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
118 }
119
120 #define ALU1(op) \
121 vec4_instruction * \
122 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
123 { \
124 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
125 }
126
127 #define ALU2(op) \
128 vec4_instruction * \
129 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
130 const src_reg &src1) \
131 { \
132 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
133 src0, src1); \
134 }
135
136 #define ALU2_ACC(op) \
137 vec4_instruction * \
138 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
139 const src_reg &src1) \
140 { \
141 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
142 BRW_OPCODE_##op, dst, src0, src1); \
143 inst->writes_accumulator = true; \
144 return inst; \
145 }
146
147 #define ALU3(op) \
148 vec4_instruction * \
149 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
150 const src_reg &src1, const src_reg &src2) \
151 { \
152 assert(devinfo->gen >= 6); \
153 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
154 src0, src1, src2); \
155 }
156
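/* Illustration (not part of the original file): each ALU* macro above stamps
 * out a thin builder method. For example, ALU2(ADD) expands to roughly:
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(const dst_reg &dst, const src_reg &src0,
 *                      const src_reg &src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * Callers then wrap the returned instruction in emit(), e.g.
 * emit(ADD(dst, a, b)), keeping construction and list insertion separate.
 */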
157 ALU1(NOT)
158 ALU1(MOV)
159 ALU1(FRC)
160 ALU1(RNDD)
161 ALU1(RNDE)
162 ALU1(RNDZ)
163 ALU1(F32TO16)
164 ALU1(F16TO32)
165 ALU2(ADD)
166 ALU2(MUL)
167 ALU2_ACC(MACH)
168 ALU2(AND)
169 ALU2(OR)
170 ALU2(XOR)
171 ALU2(DP3)
172 ALU2(DP4)
173 ALU2(DPH)
174 ALU2(SHL)
175 ALU2(SHR)
176 ALU2(ASR)
177 ALU3(LRP)
178 ALU1(BFREV)
179 ALU3(BFE)
180 ALU2(BFI1)
181 ALU3(BFI2)
182 ALU1(FBH)
183 ALU1(FBL)
184 ALU1(CBIT)
185 ALU3(MAD)
186 ALU2_ACC(ADDC)
187 ALU2_ACC(SUBB)
188 ALU2(MAC)
189 ALU1(DIM)
190
191 /** Gen4 predicated IF. */
192 vec4_instruction *
193 vec4_visitor::IF(enum brw_predicate predicate)
194 {
195 vec4_instruction *inst;
196
197 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
198 inst->predicate = predicate;
199
200 return inst;
201 }
202
203 /** Gen6 IF with embedded comparison. */
204 vec4_instruction *
205 vec4_visitor::IF(src_reg src0, src_reg src1,
206 enum brw_conditional_mod condition)
207 {
208 assert(devinfo->gen == 6);
209
210 vec4_instruction *inst;
211
212 resolve_ud_negate(&src0);
213 resolve_ud_negate(&src1);
214
215 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
216 src0, src1);
217 inst->conditional_mod = condition;
218
219 return inst;
220 }
221
222 /**
223 * CMP: Sets the low bit of the destination channels with the result
224 * of the comparison, while the upper bits are undefined, and updates
225 * the flag register with the packed 16 bits of the result.
226 */
227 vec4_instruction *
228 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
229 enum brw_conditional_mod condition)
230 {
231 vec4_instruction *inst;
232
233 /* Take the instruction:
234 *
235 * CMP null<d> src0<f> src1<f>
236 *
237 * Original gen4 does type conversion to the destination type before
238 * comparison, producing garbage results for floating point comparisons.
239 *
240 * The destination type doesn't matter on newer generations, so we set the
241 * type to match src0 so we can compact the instruction.
242 */
243 dst.type = src0.type;
244
245 resolve_ud_negate(&src0);
246 resolve_ud_negate(&src1);
247
248 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
249 inst->conditional_mod = condition;
250
251 return inst;
252 }
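/* Illustration (not from the original source, with placeholder registers
 * a, b and dst): a common use of CMP() in this file writes only the flag
 * register and then predicates a following instruction, e.g.
 *
 *    emit(CMP(dst_null_f(), a, b, BRW_CONDITIONAL_L));
 *    vec4_instruction *mov = emit(MOV(dst, brw_imm_f(0.0f)));
 *    mov->predicate = BRW_PREDICATE_NORMAL;
 *
 * which mirrors how the clipping and negative-rhw code later in this file
 * uses CMP.
 */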
253
254 vec4_instruction *
255 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
256 {
257 vec4_instruction *inst;
258
259 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
260 dst, index);
261 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
262 inst->mlen = 2;
263
264 return inst;
265 }
266
267 vec4_instruction *
268 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
269 const src_reg &index)
270 {
271 vec4_instruction *inst;
272
273 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
274 dst, src, index);
275 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
276 inst->mlen = 3;
277
278 return inst;
279 }
280
281 src_reg
282 vec4_visitor::fix_3src_operand(const src_reg &src)
283 {
284 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
285 * able to use vertical stride of zero to replicate the vec4 uniform, like
286 *
287 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
288 *
289 * But you can't, since vertical stride is always four in three-source
290 * instructions. Instead, insert a MOV instruction to do the replication so
291 * that the three-source instruction can consume it.
292 */
293
294 /* The MOV is only needed if the source is a uniform or immediate. */
295 if (src.file != UNIFORM && src.file != IMM)
296 return src;
297
298 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
299 return src;
300
301 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
302 expanded.type = src.type;
303 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
304 return src_reg(expanded);
305 }
306
307 src_reg
308 vec4_visitor::resolve_source_modifiers(const src_reg &src)
309 {
310 if (!src.abs && !src.negate)
311 return src;
312
313 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
314 resolved.type = src.type;
315 emit(MOV(resolved, src));
316
317 return src_reg(resolved);
318 }
319
320 src_reg
321 vec4_visitor::fix_math_operand(const src_reg &src)
322 {
323 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
324 return src;
325
326 /* The gen6 math instruction ignores the source modifiers --
327 * swizzle, abs, negate, and at least some parts of the register
328 * region description.
329 *
330 * Rather than trying to enumerate all these cases, *always* expand the
331 * operand to a temp GRF for gen6.
332 *
333 * For gen7, keep the operand as-is, except if immediate, which gen7 still
334 * can't use.
335 */
336
337 if (devinfo->gen == 7 && src.file != IMM)
338 return src;
339
340 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
341 expanded.type = src.type;
342 emit(MOV(expanded, src));
343 return src_reg(expanded);
344 }
345
346 vec4_instruction *
347 vec4_visitor::emit_math(enum opcode opcode,
348 const dst_reg &dst,
349 const src_reg &src0, const src_reg &src1)
350 {
351 vec4_instruction *math =
352 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
353
354 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
355 /* MATH on Gen6 must be align1, so we can't do writemasks. */
356 math->dst = dst_reg(this, glsl_type::vec4_type);
357 math->dst.type = dst.type;
358 math = emit(MOV(dst, src_reg(math->dst)));
359 } else if (devinfo->gen < 6) {
360 math->base_mrf = 1;
361 math->mlen = src1.file == BAD_FILE ? 1 : 2;
362 }
363
364 return math;
365 }
366
367 void
368 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
369 {
370 if (devinfo->gen < 7) {
371 unreachable("ir_unop_pack_half_2x16 should be lowered");
372 }
373
374 assert(dst.type == BRW_REGISTER_TYPE_UD);
375 assert(src0.type == BRW_REGISTER_TYPE_F);
376
377 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
378 *
379 * Because this instruction does not have a 16-bit floating-point type,
380 * the destination data type must be Word (W).
381 *
382 * The destination must be DWord-aligned and specify a horizontal stride
383 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
384 * each destination channel and the upper word is not modified.
385 *
386 * The above restriction implies that the f32to16 instruction must use
387 * align1 mode, because only in align1 mode is it possible to specify
388 * horizontal stride. We choose here to defy the hardware docs and emit
389 * align16 instructions.
390 *
391 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
392 * instructions. I was partially successful in that the code passed all
393 * tests. However, the code was dubiously correct and fragile, and the
394 * tests were not harsh enough to probe that frailty. Not trusting the
395 * code, I chose instead to remain in align16 mode in defiance of the hw
396 * docs).
397 *
398 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
399 * simulator, emitting a f32to16 in align16 mode with UD as destination
400 * data type is safe. The behavior differs from that specified in the PRM
401 * in that the upper word of each destination channel is cleared to 0.
402 */
403
404 dst_reg tmp_dst(this, glsl_type::uvec2_type);
405 src_reg tmp_src(tmp_dst);
406
407 #if 0
408 /* Verify the undocumented behavior on which the following instructions
409 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
410 * then the result of the bit-or instruction below will be incorrect.
411 *
412 * You should inspect the disasm output in order to verify that the MOV is
413 * not optimized away.
414 */
415 emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
416 #endif
417
418 /* Give tmp the form below, where "." means untouched.
419 *
420 * w z y x w z y x
421 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
422 *
423 * That the upper word of each write-channel be 0 is required for the
424 * following bit-shift and bit-or instructions to work. Note that this
425 * relies on the undocumented hardware behavior mentioned above.
426 */
427 tmp_dst.writemask = WRITEMASK_XY;
428 emit(F32TO16(tmp_dst, src0));
429
430 /* Give the write-channels of dst the form:
431 * 0xhhhh0000
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
434 emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
435
436 /* Finally, give the write-channels of dst the form of packHalf2x16's
437 * output:
438 * 0xhhhhllll
439 */
440 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
441 emit(OR(dst, src_reg(dst), tmp_src));
442 }
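/* Worked example (illustrative, not from the original source): for
 * packHalf2x16(vec2(1.0, 2.0)) the two half-float encodings are 0x3C00 and
 * 0x4000. The F32TO16 above leaves tmp.x = 0x00003C00 and tmp.y = 0x00004000,
 * the SHL of tmp.yyyy produces 0x40000000 in dst, and the final OR yields
 * 0x40003C00, i.e. the .y half in the upper word and the .x half in the
 * lower word.
 */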
443
444 void
445 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
446 {
447 if (devinfo->gen < 7) {
448 unreachable("ir_unop_unpack_half_2x16 should be lowered");
449 }
450
451 assert(dst.type == BRW_REGISTER_TYPE_F);
452 assert(src0.type == BRW_REGISTER_TYPE_UD);
453
454 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
455 *
456 * Because this instruction does not have a 16-bit floating-point type,
457 * the source data type must be Word (W). The destination type must be
458 * F (Float).
459 *
460 * To use W as the source data type, we must adjust horizontal strides,
461 * which is only possible in align1 mode. All my [chadv] attempts at
462 * emitting align1 instructions for unpackHalf2x16 failed to pass the
463 * Piglit tests, so I gave up.
464 *
465 * I've verified that, on gen7 hardware and the simulator, it is safe to
466 * emit f16to32 in align16 mode with UD as source data type.
467 */
468
469 dst_reg tmp_dst(this, glsl_type::uvec2_type);
470 src_reg tmp_src(tmp_dst);
471
472 tmp_dst.writemask = WRITEMASK_X;
473 emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
474
475 tmp_dst.writemask = WRITEMASK_Y;
476 emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
477
478 dst.writemask = WRITEMASK_XY;
479 emit(F16TO32(dst, tmp_src));
480 }
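/* Illustrative example (not from the original source): for src0 = 0x40003C00
 * the AND produces tmp.x = 0x3C00 and the SHR produces tmp.y = 0x4000, so the
 * final F16TO32 writes dst.xy = (1.0, 2.0), matching unpackHalf2x16().
 */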
481
482 void
483 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
484 {
485 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
486 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
487 * is not suitable to generate the shift values, but we can use the packed
488 * vector float and a type-converting MOV.
489 */
490 dst_reg shift(this, glsl_type::uvec4_type);
491 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
492
493 dst_reg shifted(this, glsl_type::uvec4_type);
494 src0.swizzle = BRW_SWIZZLE_XXXX;
495 emit(SHR(shifted, src0, src_reg(shift)));
496
497 shifted.type = BRW_REGISTER_TYPE_UB;
498 dst_reg f(this, glsl_type::vec4_type);
499 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
500
501 emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
502 }
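/* Note (added for illustration): the packed vector-float immediate used above
 * encodes each element as an 8-bit restricted float (1 sign bit, 3 exponent
 * bits with bias 3, 4 mantissa bits), so 0x00, 0x60, 0x70 and 0x78 decode to
 * 0.0, 8.0, 16.0 and 24.0, exactly the per-channel shift amounts <0, 8, 16,
 * 24> mentioned in the comment. The snorm variant below uses the same trick.
 */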
503
504 void
505 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
506 {
507 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
508 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
509 * is not suitable to generate the shift values, but we can use the packed
510 * vector float and a type-converting MOV.
511 */
512 dst_reg shift(this, glsl_type::uvec4_type);
513 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
514
515 dst_reg shifted(this, glsl_type::uvec4_type);
516 src0.swizzle = BRW_SWIZZLE_XXXX;
517 emit(SHR(shifted, src0, src_reg(shift)));
518
519 shifted.type = BRW_REGISTER_TYPE_B;
520 dst_reg f(this, glsl_type::vec4_type);
521 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
522
523 dst_reg scaled(this, glsl_type::vec4_type);
524 emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
525
526 dst_reg max(this, glsl_type::vec4_type);
527 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
528 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
529 }
530
531 void
532 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
533 {
534 dst_reg saturated(this, glsl_type::vec4_type);
535 vec4_instruction *inst = emit(MOV(saturated, src0));
536 inst->saturate = true;
537
538 dst_reg scaled(this, glsl_type::vec4_type);
539 emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
540
541 dst_reg rounded(this, glsl_type::vec4_type);
542 emit(RNDE(rounded, src_reg(scaled)));
543
544 dst_reg u(this, glsl_type::uvec4_type);
545 emit(MOV(u, src_reg(rounded)));
546
547 src_reg bytes(u);
548 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
549 }
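/* Worked example (illustrative): packUnorm4x8(vec4(0.2, 0.0, 1.0, 1.0))
 * saturates to the same values, scales to roughly (51.0, 0.0, 255.0, 255.0),
 * rounds and converts to the bytes (0x33, 0x00, 0xff, 0xff), and packs .x
 * into the least significant byte, giving 0xffff0033.
 */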
550
551 void
552 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
553 {
554 dst_reg max(this, glsl_type::vec4_type);
555 emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
556
557 dst_reg min(this, glsl_type::vec4_type);
558 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
559
560 dst_reg scaled(this, glsl_type::vec4_type);
561 emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
562
563 dst_reg rounded(this, glsl_type::vec4_type);
564 emit(RNDE(rounded, src_reg(scaled)));
565
566 dst_reg i(this, glsl_type::ivec4_type);
567 emit(MOV(i, src_reg(rounded)));
568
569 src_reg bytes(i);
570 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
571 }
572
573 /*
574 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
575 * false) elements needed to pack a type.
576 */
577 static int
578 type_size_xvec4(const struct glsl_type *type, bool as_vec4)
579 {
580 unsigned int i;
581 int size;
582
583 switch (type->base_type) {
584 case GLSL_TYPE_UINT:
585 case GLSL_TYPE_INT:
586 case GLSL_TYPE_FLOAT:
587 case GLSL_TYPE_BOOL:
588 case GLSL_TYPE_DOUBLE:
589 if (type->is_matrix()) {
590 const glsl_type *col_type = type->column_type();
591 unsigned col_slots =
592 (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
593 return type->matrix_columns * col_slots;
594 } else {
595 /* Regardless of size of vector, it gets a vec4. This is bad
596 * packing for things like floats, but otherwise arrays become a
597 * mess. Hopefully a later pass over the code can pack scalars
598 * down if appropriate.
599 */
600 return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
601 }
602 case GLSL_TYPE_ARRAY:
603 assert(type->length > 0);
604 return type_size_xvec4(type->fields.array, as_vec4) * type->length;
605 case GLSL_TYPE_STRUCT:
606 size = 0;
607 for (i = 0; i < type->length; i++) {
608 size += type_size_xvec4(type->fields.structure[i].type, as_vec4);
609 }
610 return size;
611 case GLSL_TYPE_SUBROUTINE:
612 return 1;
613
614 case GLSL_TYPE_SAMPLER:
615 /* Samplers take up no register space, since they're baked in at
616 * link time.
617 */
618 return 0;
619 case GLSL_TYPE_ATOMIC_UINT:
620 return 0;
621 case GLSL_TYPE_IMAGE:
622 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
623 case GLSL_TYPE_VOID:
624 case GLSL_TYPE_ERROR:
625 case GLSL_TYPE_INTERFACE:
626 case GLSL_TYPE_FUNCTION:
627 unreachable("not reached");
628 }
629
630 return 0;
631 }
632
633 /**
634 * Returns the minimum number of vec4 elements needed to pack a type.
635 *
636 * For simple types, it will return 1 (a single vec4); for matrices, the
637 * number of columns; for array and struct, the sum of the vec4_size of
638 * each of its elements; and for sampler and atomic, zero.
639 *
640 * This method is useful to calculate how much register space is needed to
641 * store a particular type.
642 */
643 extern "C" int
644 type_size_vec4(const struct glsl_type *type)
645 {
646 return type_size_xvec4(type, true);
647 }
648
649 /**
650 * Returns the minimum number of dvec4 elements needed to pack a type.
651 *
652 * For simple types, it will return 1 (a single dvec4); for matrices, the
653 * number of columns; for array and struct, the sum of the dvec4_size of
654 * each of its elements; and for sampler and atomic, zero.
655 *
656 * This method is useful to calculate how much register space is needed to
657 * store a particular type.
658 *
659 * Measuring double-precision vertex inputs as dvec4 is required because
660 * ARB_vertex_attrib_64bit states that these use the same number of locations
661 * as the single-precision version. That is, two consecutive dvec4s would be
662 * located in location "x" and location "x+1", not "x+2".
663 *
664 * In order to map vec4/dvec4 vertex inputs to the proper ATTRs,
665 * remap_vs_attrs() takes into account both the location and whether the
666 * type fits in one or two vec4 slots.
667 */
668 extern "C" int
669 type_size_dvec4(const struct glsl_type *type)
670 {
671 return type_size_xvec4(type, false);
672 }
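/* Examples (added for illustration) of how the two counting modes differ:
 *
 *    type_size_vec4(glsl_type::vec3_type)   == 1   (any vec fits in one slot)
 *    type_size_vec4(glsl_type::mat4_type)   == 4   (one slot per column)
 *    type_size_vec4(glsl_type::dvec4_type)  == 2   (dual-slot in vec4 units)
 *    type_size_dvec4(glsl_type::dvec4_type) == 1   (one dvec4-sized slot)
 *
 * Arrays and structs are simply the sum of their members, so float[3] costs
 * three full vec4 slots even though only one component of each is used.
 */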
673
674 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
675 {
676 init();
677
678 this->file = VGRF;
679 this->nr = v->alloc.allocate(type_size_vec4(type));
680
681 if (type->is_array() || type->is_record()) {
682 this->swizzle = BRW_SWIZZLE_NOOP;
683 } else {
684 this->swizzle = brw_swizzle_for_size(type->vector_elements);
685 }
686
687 this->type = brw_type_for_base_type(type);
688 }
689
690 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
691 {
692 assert(size > 0);
693
694 init();
695
696 this->file = VGRF;
697 this->nr = v->alloc.allocate(type_size_vec4(type) * size);
698
699 this->swizzle = BRW_SWIZZLE_NOOP;
700
701 this->type = brw_type_for_base_type(type);
702 }
703
704 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
705 {
706 init();
707
708 this->file = VGRF;
709 this->nr = v->alloc.allocate(type_size_vec4(type));
710
711 if (type->is_array() || type->is_record()) {
712 this->writemask = WRITEMASK_XYZW;
713 } else {
714 this->writemask = (1 << type->vector_elements) - 1;
715 }
716
717 this->type = brw_type_for_base_type(type);
718 }
719
720 vec4_instruction *
721 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
722 src_reg src0, src_reg src1)
723 {
724 vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
725 inst->conditional_mod = conditionalmod;
726 return inst;
727 }
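/* Note (added for illustration): SEL with a conditional modifier implements
 * min/max, so emit_minmax(BRW_CONDITIONAL_L, dst, a, b) computes min(a, b)
 * and emit_minmax(BRW_CONDITIONAL_GE, dst, a, b) computes max(a, b). The
 * snorm pack/unpack helpers above use this pair to clamp to [-1.0, 1.0].
 */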
728
729 vec4_instruction *
730 vec4_visitor::emit_lrp(const dst_reg &dst,
731 const src_reg &x, const src_reg &y, const src_reg &a)
732 {
733 if (devinfo->gen >= 6) {
734 /* Note that the instruction's argument order is reversed from GLSL
735 * and the IR.
736 */
737 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
738 fix_3src_operand(x)));
739 } else {
740 /* Earlier generations don't support three source operations, so we
741 * need to emit x*(1-a) + y*a.
742 */
743 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
744 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
745 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
746 y_times_a.writemask = dst.writemask;
747 one_minus_a.writemask = dst.writemask;
748 x_times_one_minus_a.writemask = dst.writemask;
749
750 emit(MUL(y_times_a, y, a));
751 emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
752 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
753 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
754 }
755 }
756
757 /**
758 * Emits the instructions needed to perform a pull constant load. before_block
759 * and before_inst can be NULL, in which case the instructions will be appended
760 * to the end of the instruction list.
761 */
762 void
763 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
764 src_reg surf_index,
765 src_reg offset_reg,
766 bblock_t *before_block,
767 vec4_instruction *before_inst)
768 {
769 assert((before_inst == NULL && before_block == NULL) ||
770 (before_inst && before_block));
771
772 vec4_instruction *pull;
773
774 if (devinfo->gen >= 9) {
775 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
776 src_reg header(this, glsl_type::uvec4_type, 2);
777
778 pull = new(mem_ctx)
779 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
780 dst_reg(header));
781
782 if (before_inst)
783 emit_before(before_block, before_inst, pull);
784 else
785 emit(pull);
786
787 dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
788 offset_reg.type);
789 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
790
791 if (before_inst)
792 emit_before(before_block, before_inst, pull);
793 else
794 emit(pull);
795
796 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
797 dst,
798 surf_index,
799 header);
800 pull->mlen = 2;
801 pull->header_size = 1;
802 } else if (devinfo->gen >= 7) {
803 dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
804
805 grf_offset.type = offset_reg.type;
806
807 pull = MOV(grf_offset, offset_reg);
808
809 if (before_inst)
810 emit_before(before_block, before_inst, pull);
811 else
812 emit(pull);
813
814 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
815 dst,
816 surf_index,
817 src_reg(grf_offset));
818 pull->mlen = 1;
819 } else {
820 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
821 dst,
822 surf_index,
823 offset_reg);
824 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
825 pull->mlen = 1;
826 }
827
828 if (before_inst)
829 emit_before(before_block, before_inst, pull);
830 else
831 emit(pull);
832 }
833
834 src_reg
835 vec4_visitor::emit_uniformize(const src_reg &src)
836 {
837 const src_reg chan_index(this, glsl_type::uint_type);
838 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
839 src.type);
840
841 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
842 ->force_writemask_all = true;
843 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
844 ->force_writemask_all = true;
845
846 return src_reg(dst);
847 }
848
849 src_reg
850 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
851 src_reg coordinate, src_reg surface)
852 {
853 vec4_instruction *inst =
854 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
855 dst_reg(this, glsl_type::uvec4_type));
856 inst->base_mrf = 2;
857 inst->src[1] = surface;
858 inst->src[2] = surface;
859
860 int param_base;
861
862 if (devinfo->gen >= 9) {
863 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
864 vec4_instruction *header_inst = new(mem_ctx)
865 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
866 dst_reg(MRF, inst->base_mrf));
867
868 emit(header_inst);
869
870 inst->mlen = 2;
871 inst->header_size = 1;
872 param_base = inst->base_mrf + 1;
873 } else {
874 inst->mlen = 1;
875 param_base = inst->base_mrf;
876 }
877
878 /* parameters are: u, v, r, lod; lod will always be zero due to API restrictions */
879 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
880 int zero_mask = 0xf & ~coord_mask;
881
882 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
883 coordinate));
884
885 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
886 brw_imm_d(0)));
887
888 emit(inst);
889 return src_reg(inst->dst);
890 }
891
892 bool
893 vec4_visitor::is_high_sampler(src_reg sampler)
894 {
895 if (devinfo->gen < 8 && !devinfo->is_haswell)
896 return false;
897
898 return sampler.file != IMM || sampler.ud >= 16;
899 }
900
901 void
902 vec4_visitor::emit_texture(ir_texture_opcode op,
903 dst_reg dest,
904 const glsl_type *dest_type,
905 src_reg coordinate,
906 int coord_components,
907 src_reg shadow_comparator,
908 src_reg lod, src_reg lod2,
909 src_reg sample_index,
910 uint32_t constant_offset,
911 src_reg offset_value,
912 src_reg mcs,
913 uint32_t surface,
914 src_reg surface_reg,
915 src_reg sampler_reg)
916 {
917 /* The sampler can only meaningfully compute LOD for fragment shader
918 * messages. For all other stages, we change the opcode to TXL and hardcode
919 * the LOD to 0.
920 *
921 * textureQueryLevels() is implemented in terms of TXS so we need to pass a
922 * valid LOD argument.
923 */
924 if (op == ir_tex || op == ir_query_levels) {
925 assert(lod.file == BAD_FILE);
926 lod = brw_imm_f(0.0f);
927 }
928
929 enum opcode opcode;
930 switch (op) {
931 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
932 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
933 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
934 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
935 case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
936 SHADER_OPCODE_TXF_CMS); break;
937 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
938 case ir_tg4: opcode = offset_value.file != BAD_FILE
939 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
940 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
941 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
942 case ir_txb:
943 unreachable("TXB is not valid for vertex shaders.");
944 case ir_lod:
945 unreachable("LOD is not valid for vertex shaders.");
946 case ir_samples_identical: {
947 /* There are some challenges implementing this for vec4, and it seems
948 * unlikely to be used anyway. For now, just always return false.
949 */
950 emit(MOV(dest, brw_imm_ud(0u)));
951 return;
952 }
953 default:
954 unreachable("Unrecognized tex op");
955 }
956
957 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
958
959 inst->offset = constant_offset;
960
961 /* The message header is necessary for:
962 * - Gen4 (always)
963 * - Gen9+ for selecting SIMD4x2
964 * - Texel offsets
965 * - Gather channel selection
966 * - Sampler indices too large to fit in a 4-bit value.
967 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
968 */
969 inst->header_size =
970 (devinfo->gen < 5 || devinfo->gen >= 9 ||
971 inst->offset != 0 || op == ir_tg4 ||
972 op == ir_texture_samples ||
973 is_high_sampler(sampler_reg)) ? 1 : 0;
974 inst->base_mrf = 2;
975 inst->mlen = inst->header_size;
976 inst->dst.writemask = WRITEMASK_XYZW;
977 inst->shadow_compare = shadow_comparator.file != BAD_FILE;
978
979 inst->src[1] = surface_reg;
980 inst->src[2] = sampler_reg;
981
982 /* MRF for the first parameter */
983 int param_base = inst->base_mrf + inst->header_size;
984
985 if (op == ir_txs || op == ir_query_levels) {
986 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
987 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
988 inst->mlen++;
989 } else if (op == ir_texture_samples) {
990 inst->dst.writemask = WRITEMASK_X;
991 } else {
992 /* Load the coordinate */
993 /* FINISHME: gl_clamp_mask and saturate */
994 int coord_mask = (1 << coord_components) - 1;
995 int zero_mask = 0xf & ~coord_mask;
996
997 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
998 coordinate));
999 inst->mlen++;
1000
1001 if (zero_mask != 0) {
1002 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
1003 brw_imm_d(0)));
1004 }
1005 /* Load the shadow comparator */
1006 if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
1007 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
1008 WRITEMASK_X),
1009 shadow_comparator));
1010 inst->mlen++;
1011 }
1012
1013 /* Load the LOD info */
1014 if (op == ir_tex || op == ir_txl) {
1015 int mrf, writemask;
1016 if (devinfo->gen >= 5) {
1017 mrf = param_base + 1;
1018 if (shadow_comparator.file != BAD_FILE) {
1019 writemask = WRITEMASK_Y;
1020 /* mlen already incremented */
1021 } else {
1022 writemask = WRITEMASK_X;
1023 inst->mlen++;
1024 }
1025 } else /* devinfo->gen == 4 */ {
1026 mrf = param_base;
1027 writemask = WRITEMASK_W;
1028 }
1029 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
1030 } else if (op == ir_txf) {
1031 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
1032 } else if (op == ir_txf_ms) {
1033 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1034 sample_index));
1035 if (opcode == SHADER_OPCODE_TXF_CMS_W) {
1036 /* MCS data is stored in the first two channels of ‘mcs’, but we
1037 * need to get it into the .y and .z channels of the second vec4
1038 * of params.
1039 */
1040 mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1041 emit(MOV(dst_reg(MRF, param_base + 1,
1042 glsl_type::uint_type, WRITEMASK_YZ),
1043 mcs));
1044 } else if (devinfo->gen >= 7) {
1045 /* MCS data is in the first channel of `mcs`, but we need to get it into
1046 * the .y channel of the second vec4 of params, so replicate .x across
1047 * the whole vec4 and then mask off everything except .y
1048 */
1049 mcs.swizzle = BRW_SWIZZLE_XXXX;
1050 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1051 mcs));
1052 }
1053 inst->mlen++;
1054 } else if (op == ir_txd) {
1055 const brw_reg_type type = lod.type;
1056
1057 if (devinfo->gen >= 5) {
1058 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1059 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1060 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1061 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1062 inst->mlen++;
1063
1064 if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) {
1065 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1066 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1067 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1068 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1069 inst->mlen++;
1070
1071 if (shadow_comparator.file != BAD_FILE) {
1072 emit(MOV(dst_reg(MRF, param_base + 2,
1073 shadow_comparator.type, WRITEMASK_Z),
1074 shadow_comparator));
1075 }
1076 }
1077 } else /* devinfo->gen == 4 */ {
1078 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1079 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1080 inst->mlen += 2;
1081 }
1082 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1083 if (shadow_comparator.file != BAD_FILE) {
1084 emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
1085 shadow_comparator));
1086 }
1087
1088 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1089 offset_value));
1090 inst->mlen++;
1091 }
1092 }
1093
1094 emit(inst);
1095
1096 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1097 * spec requires layers.
1098 */
1099 if (op == ir_txs && devinfo->gen < 7) {
1100 /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
1101 emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
1102 src_reg(inst->dst), brw_imm_d(1));
1103 }
1104
1105 if (devinfo->gen == 6 && op == ir_tg4) {
1106 emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1107 }
1108
1109 if (op == ir_query_levels) {
1110 /* # levels is in .w */
1111 src_reg swizzled(dest);
1112 swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1113 SWIZZLE_W, SWIZZLE_W);
1114 emit(MOV(dest, swizzled));
1115 }
1116 }
1117
1118 /**
1119 * Apply workarounds for Gen6 gather with UINT/SINT
1120 */
1121 void
1122 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1123 {
1124 if (!wa)
1125 return;
1126
1127 int width = (wa & WA_8BIT) ? 8 : 16;
1128 dst_reg dst_f = dst;
1129 dst_f.type = BRW_REGISTER_TYPE_F;
1130
1131 /* Convert from UNORM to UINT */
1132 emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1133 emit(MOV(dst, src_reg(dst_f)));
1134
1135 if (wa & WA_SIGN) {
1136 /* Reinterpret the UINT value as a signed INT value by
1137 * shifting the sign bit into place, then shifting back
1138 * preserving sign.
1139 */
1140 emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1141 emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1142 }
1143 }
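/* Worked example (illustrative): with WA_8BIT | WA_SIGN the gathered value
 * comes back as UNORM, so a texel of 0xff reads as 1.0. The MUL by 255 turns
 * that into 255, and the SHL/ASR pair by 24 bits sign-extends the low byte,
 * producing the expected signed value -1.
 */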
1144
1145 void
1146 vec4_visitor::gs_emit_vertex(int /* stream_id */)
1147 {
1148 unreachable("not reached");
1149 }
1150
1151 void
1152 vec4_visitor::gs_end_primitive()
1153 {
1154 unreachable("not reached");
1155 }
1156
1157 void
1158 vec4_visitor::emit_ndc_computation()
1159 {
1160 if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
1161 return;
1162
1163 /* Get the position */
1164 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);
1165
1166 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1167 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1168 output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
1169 output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;
1170
1171 current_annotation = "NDC";
1172 dst_reg ndc_w = ndc;
1173 ndc_w.writemask = WRITEMASK_W;
1174 src_reg pos_w = pos;
1175 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1176 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1177
1178 dst_reg ndc_xyz = ndc;
1179 ndc_xyz.writemask = WRITEMASK_XYZ;
1180
1181 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1182 }
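/* Example (added for illustration): for a clip-space position of
 * (2.0, 4.0, 6.0, 2.0) the RCP produces ndc.w = 0.5 and the MUL produces
 * ndc.xyz = (1.0, 2.0, 3.0), i.e. (x/w, y/w, z/w, 1/w) as described above.
 */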
1183
1184 void
1185 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1186 {
1187 if (devinfo->gen < 6 &&
1188 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1189 output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
1190 devinfo->has_negative_rhw_bug)) {
1191 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1192 dst_reg header1_w = header1;
1193 header1_w.writemask = WRITEMASK_W;
1194
1195 emit(MOV(header1, brw_imm_ud(0u)));
1196
1197 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1198 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1199
1200 current_annotation = "Point size";
1201 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1202 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1203 }
1204
1205 if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
1206 current_annotation = "Clipping flags";
1207 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1208 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1209
1210 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1211 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1212 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1213
1214 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1215 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1216 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1217 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1218 }
1219
1220 /* i965 clipping workaround:
1221 * 1) Test for -ve rhw
1222 * 2) If set,
1223 * set ndc = (0,0,0,0)
1224 * set ucp[6] = 1
1225 *
1226 * Later, clipping will detect ucp[6] and ensure the primitive is
1227 * clipped against all fixed planes.
1228 */
1229 if (devinfo->has_negative_rhw_bug &&
1230 output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
1231 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
1232 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1233 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1234 vec4_instruction *inst;
1235 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1236 inst->predicate = BRW_PREDICATE_NORMAL;
1237 output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
1238 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
1239 inst->predicate = BRW_PREDICATE_NORMAL;
1240 }
1241
1242 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1243 } else if (devinfo->gen < 6) {
1244 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1245 } else {
1246 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1247 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1248 dst_reg reg_w = reg;
1249 reg_w.writemask = WRITEMASK_W;
1250 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1251 reg_as_src.type = reg_w.type;
1252 reg_as_src.swizzle = brw_swizzle_for_size(1);
1253 emit(MOV(reg_w, reg_as_src));
1254 }
1255 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1256 dst_reg reg_y = reg;
1257 reg_y.writemask = WRITEMASK_Y;
1258 reg_y.type = BRW_REGISTER_TYPE_D;
1259 output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
1260 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
1261 }
1262 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1263 dst_reg reg_z = reg;
1264 reg_z.writemask = WRITEMASK_Z;
1265 reg_z.type = BRW_REGISTER_TYPE_D;
1266 output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
1267 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
1268 }
1269 }
1270 }
1271
1272 vec4_instruction *
1273 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
1274 {
1275 assert(varying < VARYING_SLOT_MAX);
1276
1277 unsigned num_comps = output_num_components[varying][component];
1278 if (num_comps == 0)
1279 return NULL;
1280
1281 assert(output_reg[varying][component].type == reg.type);
1282 current_annotation = output_reg_annotation[varying];
1283 if (output_reg[varying][component].file != BAD_FILE) {
1284 src_reg src = src_reg(output_reg[varying][component]);
1285 src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
1286 reg.writemask =
1287 brw_writemask_for_component_packing(num_comps, component);
1288 return emit(MOV(reg, src));
1289 }
1290 return NULL;
1291 }
1292
1293 void
1294 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1295 {
1296 reg.type = BRW_REGISTER_TYPE_F;
1297 output_reg[varying][0].type = reg.type;
1298
1299 switch (varying) {
1300 case VARYING_SLOT_PSIZ:
1301 {
1302 /* PSIZ is always in slot 0, and is coupled with other flags. */
1303 current_annotation = "indices, point width, clip flags";
1304 emit_psiz_and_flags(reg);
1305 break;
1306 }
1307 case BRW_VARYING_SLOT_NDC:
1308 current_annotation = "NDC";
1309 if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
1310 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
1311 break;
1312 case VARYING_SLOT_POS:
1313 current_annotation = "gl_Position";
1314 if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
1315 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
1316 break;
1317 case VARYING_SLOT_EDGE:
1318 /* This is present when doing unfilled polygons. We're supposed to copy
1319 * the edge flag from the user-provided vertex array
1320 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1321 * of that attribute (starts as 1.0f). This is then used in clipping to
1322 * determine which edges should be drawn as wireframe.
1323 */
1324 current_annotation = "edge flag";
1325 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1326 glsl_type::float_type, WRITEMASK_XYZW))));
1327 break;
1328 case BRW_VARYING_SLOT_PAD:
1329 /* No need to write to this slot */
1330 break;
1331 default:
1332 for (int i = 0; i < 4; i++) {
1333 emit_generic_urb_slot(reg, varying, i);
1334 }
1335 break;
1336 }
1337 }
1338
1339 static int
1340 align_interleaved_urb_mlen(const struct gen_device_info *devinfo, int mlen)
1341 {
1342 if (devinfo->gen >= 6) {
1343 /* URB data written (does not include the message header reg) must
1344 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1345 * section 5.4.3.2.2: URB_INTERLEAVED.
1346 *
1347 * URB entries are allocated on a multiple of 1024 bits, so an
1348 * extra 128 bits written here to make the end align to 256 is
1349 * no problem.
1350 */
1351 if ((mlen % 2) != 1)
1352 mlen++;
1353 }
1354
1355 return mlen;
1356 }
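/* Example (added for illustration): mlen counts the message header register,
 * so for mlen == 4 the payload is 3 registers (odd) and the function bumps
 * mlen to 5, giving 4 data registers, a multiple of 256 bits. mlen == 5 is
 * already aligned and is returned unchanged.
 */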
1357
1358
1359 /**
1360 * Generates the VUE payload plus the necessary URB write instructions to
1361 * output it.
1362 *
1363 * The VUE layout is documented in Volume 2a.
1364 */
1365 void
1366 vec4_visitor::emit_vertex()
1367 {
1368 /* MRF 0 is reserved for the debugger, so start with message header
1369 * in MRF 1.
1370 */
1371 int base_mrf = 1;
1372 int mrf = base_mrf;
1373 /* In the process of generating our URB write message contents, we
1374 * may need to unspill a register or load from an array. Those
1375 * reads would use MRFs 14-15.
1376 */
1377 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1378
1379 /* The following assertion verifies that max_usable_mrf causes an
1380 * even-numbered amount of URB write data, which will meet gen6's
1381 * requirements for length alignment.
1382 */
1383 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1384
1385 /* First mrf is the g0-based message header containing URB handles and
1386 * such.
1387 */
1388 emit_urb_write_header(mrf++);
1389
1390 if (devinfo->gen < 6) {
1391 emit_ndc_computation();
1392 }
1393
1394 /* We may need to split this up into several URB writes, so do them in a
1395 * loop.
1396 */
1397 int slot = 0;
1398 bool complete = false;
1399 do {
1400 /* URB offset is in URB row increments, and each of our MRFs is half of
1401 * one of those, since we're doing interleaved writes.
1402 */
1403 int offset = slot / 2;
1404
1405 mrf = base_mrf + 1;
1406 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1407 emit_urb_slot(dst_reg(MRF, mrf++),
1408 prog_data->vue_map.slot_to_varying[slot]);
1409
1410 /* If this was max_usable_mrf, we can't fit anything more into this
1411 * URB WRITE. Same thing if we reached the maximum length available.
1412 */
1413 if (mrf > max_usable_mrf ||
1414 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1415 slot++;
1416 break;
1417 }
1418 }
1419
1420 complete = slot >= prog_data->vue_map.num_slots;
1421 current_annotation = "URB write";
1422 vec4_instruction *inst = emit_urb_write_opcode(complete);
1423 inst->base_mrf = base_mrf;
1424 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1425 inst->offset += offset;
1426 } while(!complete);
1427 }
1428
1429
1430 src_reg
1431 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1432 src_reg *reladdr, int reg_offset)
1433 {
1434 /* Because we store the values to scratch interleaved like our
1435 * vertex data, we need to scale the vec4 index by 2.
1436 */
1437 int message_header_scale = 2;
1438
1439 /* Pre-gen6, the message header uses byte offsets instead of vec4
1440 * (16-byte) offset units.
1441 */
1442 if (devinfo->gen < 6)
1443 message_header_scale *= 16;
1444
1445 if (reladdr) {
1446 /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
1447 * to multiply the reladdr by 2. Notice that the reg_offset part
1448 * is in units of 16 bytes and is used to select the low/high 16-byte
1449 * chunk of a full dvec4, so we don't want to multiply that part.
1450 */
1451 src_reg index = src_reg(this, glsl_type::int_type);
1452 if (type_sz(inst->dst.type) < 8) {
1453 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1454 brw_imm_d(reg_offset)));
1455 emit_before(block, inst, MUL(dst_reg(index), index,
1456 brw_imm_d(message_header_scale)));
1457 } else {
1458 emit_before(block, inst, MUL(dst_reg(index), *reladdr,
1459 brw_imm_d(message_header_scale * 2)));
1460 emit_before(block, inst, ADD(dst_reg(index), index,
1461 brw_imm_d(reg_offset * message_header_scale)));
1462 }
1463 return index;
1464 } else {
1465 return brw_imm_d(reg_offset * message_header_scale);
1466 }
1467 }
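/* Example (added for illustration): with no reladdr and reg_offset == 3 this
 * returns the immediate 6 on gen6+ (scratch rows are interleaved, so vec4
 * offsets are scaled by 2) and 96 on gen4/5, where the message header wants
 * byte offsets (3 * 2 * 16).
 */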
1468
1469 /**
1470 * Emits an instruction before @inst to load the value named by @orig_src
1471 * from scratch space at @base_offset to @temp.
1472 *
1473 * @base_offset is measured in 32-byte units (the size of a register).
1474 */
1475 void
1476 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1477 dst_reg temp, src_reg orig_src,
1478 int base_offset)
1479 {
1480 assert(orig_src.offset % REG_SIZE == 0);
1481 int reg_offset = base_offset + orig_src.offset / REG_SIZE;
1482 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1483 reg_offset);
1484
1485 if (type_sz(orig_src.type) < 8) {
1486 emit_before(block, inst, SCRATCH_READ(temp, index));
1487 } else {
1488 dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
1489 dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
1490 emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
1491 index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
1492 vec4_instruction *last_read =
1493 SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
1494 emit_before(block, inst, last_read);
1495 shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
1496 }
1497 }
1498
1499 /**
1500 * Emits an instruction after @inst to store the value to be written
1501 * to @orig_dst to scratch space at @base_offset, from @temp.
1502 *
1503 * @base_offset is measured in 32-byte units (the size of a register).
1504 */
1505 void
1506 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1507 int base_offset)
1508 {
1509 assert(inst->dst.offset % REG_SIZE == 0);
1510 int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
1511 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1512 reg_offset);
1513
1514 /* Create a temporary register to store *inst's result in.
1515 *
1516 * We have to be careful in MOVing from our temporary result register in
1517 * the scratch write. If we swizzle from channels of the temporary that
1518 * weren't initialized, it will confuse live interval analysis, which will
1519 * make spilling fail to make progress.
1520 */
1521 bool is_64bit = type_sz(inst->dst.type) == 8;
1522 const glsl_type *alloc_type =
1523 is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
1524 const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
1525 inst->dst.type),
1526 brw_swizzle_for_mask(inst->dst.writemask));
1527
1528 if (!is_64bit) {
1529 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1530 inst->dst.writemask));
1531 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1532 if (inst->opcode != BRW_OPCODE_SEL)
1533 write->predicate = inst->predicate;
1534 write->ir = inst->ir;
1535 write->annotation = inst->annotation;
1536 inst->insert_after(block, write);
1537 } else {
1538 dst_reg shuffled = dst_reg(this, alloc_type);
1539 vec4_instruction *last =
1540 shuffle_64bit_data(shuffled, temp, true, block, inst);
1541 src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
1542
1543 uint8_t mask = 0;
1544 if (inst->dst.writemask & WRITEMASK_X)
1545 mask |= WRITEMASK_XY;
1546 if (inst->dst.writemask & WRITEMASK_Y)
1547 mask |= WRITEMASK_ZW;
1548 if (mask) {
1549 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1550
1551 vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
1552 if (inst->opcode != BRW_OPCODE_SEL)
1553 write->predicate = inst->predicate;
1554 write->ir = inst->ir;
1555 write->annotation = inst->annotation;
1556 last->insert_after(block, write);
1557 }
1558
1559 mask = 0;
1560 if (inst->dst.writemask & WRITEMASK_Z)
1561 mask |= WRITEMASK_XY;
1562 if (inst->dst.writemask & WRITEMASK_W)
1563 mask |= WRITEMASK_ZW;
1564 if (mask) {
1565 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1566
1567 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1568 reg_offset + 1);
1569 vec4_instruction *write =
1570 SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
1571 if (inst->opcode != BRW_OPCODE_SEL)
1572 write->predicate = inst->predicate;
1573 write->ir = inst->ir;
1574 write->annotation = inst->annotation;
1575 last->insert_after(block, write);
1576 }
1577 }
1578
1579 inst->dst.file = temp.file;
1580 inst->dst.nr = temp.nr;
1581 inst->dst.offset %= REG_SIZE;
1582 inst->dst.reladdr = NULL;
1583 }
1584
1585 /**
1586 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1587 * adds the scratch read(s) before \p inst. The function also checks for
1588 * recursive reladdr scratch accesses, issuing the corresponding scratch
1589 * loads and rewriting reladdr references accordingly.
1590 *
1591 * \return \p src if it did not require a scratch load, otherwise, the
1592 * register holding the result of the scratch load that the caller should
1593 * use to rewrite src.
1594 */
1595 src_reg
1596 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1597 vec4_instruction *inst, src_reg src)
1598 {
1599 /* Resolve recursive reladdr scratch access by calling ourselves
1600 * with src.reladdr
1601 */
1602 if (src.reladdr)
1603 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1604 *src.reladdr);
1605
1606 /* Now handle scratch access on src */
1607 if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1608 dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
1609 glsl_type::dvec4_type : glsl_type::vec4_type);
1610 emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1611 src.nr = temp.nr;
1612 src.offset %= REG_SIZE;
1613 src.reladdr = NULL;
1614 }
1615
1616 return src;
1617 }
1618
1619 /**
1620 * We can't generally support array access in GRF space, because a
1621 * single instruction's destination can only span 2 contiguous
1622 * registers. So, we send all GRF arrays that get variable index
1623 * access to scratch space.
1624 */
1625 void
1626 vec4_visitor::move_grf_array_access_to_scratch()
1627 {
1628 int scratch_loc[this->alloc.count];
1629 memset(scratch_loc, -1, sizeof(scratch_loc));
1630
1631 /* First, calculate the set of virtual GRFs that need to be punted
1632 * to scratch due to having any array access on them, and where in
1633 * scratch.
1634 */
1635 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1636 if (inst->dst.file == VGRF && inst->dst.reladdr) {
1637 if (scratch_loc[inst->dst.nr] == -1) {
1638 scratch_loc[inst->dst.nr] = last_scratch;
1639 last_scratch += this->alloc.sizes[inst->dst.nr];
1640 }
1641
1642 for (src_reg *iter = inst->dst.reladdr;
1643 iter->reladdr;
1644 iter = iter->reladdr) {
1645 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1646 scratch_loc[iter->nr] = last_scratch;
1647 last_scratch += this->alloc.sizes[iter->nr];
1648 }
1649 }
1650 }
1651
1652 for (int i = 0 ; i < 3; i++) {
1653 for (src_reg *iter = &inst->src[i];
1654 iter->reladdr;
1655 iter = iter->reladdr) {
1656 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1657 scratch_loc[iter->nr] = last_scratch;
1658 last_scratch += this->alloc.sizes[iter->nr];
1659 }
1660 }
1661 }
1662 }
1663
1664 /* Now, for anything that will be accessed through scratch, rewrite
1665 * it to load/store. Note that this is a _safe list walk, because
1666 * we may generate a new scratch_write instruction after the one
1667 * we're processing.
1668 */
1669 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1670 /* Set up the annotation tracking for new generated instructions. */
1671 base_ir = inst->ir;
1672 current_annotation = inst->annotation;
1673
1674 /* First handle scratch access on the dst. Notice we have to handle
1675 * the case where the dst's reladdr also points to scratch space.
1676 */
1677 if (inst->dst.reladdr)
1678 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1679 *inst->dst.reladdr);
1680
1681 /* Now that we have handled any (possibly recursive) reladdr scratch
1682 * accesses for dst we can safely do the scratch write for dst itself
1683 */
1684 if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1685 emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1686
1687 /* Now handle scratch access on any src. In this case, since inst->src[i]
1688 * already is a src_reg, we can just call emit_resolve_reladdr with
1689 * inst->src[i] and it will take care of handling scratch loads for
1690 * both src and src.reladdr (recursively).
1691 */
1692 for (int i = 0 ; i < 3; i++) {
1693 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1694 inst->src[i]);
1695 }
1696 }
1697 }
1698
1699 /**
1700 * Emits instructions before @inst to load the value named by @orig_src from
1701 * the pull constant buffer (surface) at @base_offset (in 16-byte units) to @temp.
1702 */
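/* Worked example (illustrative numbers): with base_offset == 2 and
 * orig_src.offset == 48, reg_offset below is 2 + 48/16 == 5, and the load
 * message reads the 16 bytes at byte offset 5 * 16 == 80 of the surface.
 */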
1703 void
1704 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1705 dst_reg temp, src_reg orig_src,
1706 int base_offset, src_reg indirect)
1707 {
1708 assert(orig_src.offset % 16 == 0);
1709 const unsigned index = prog_data->base.binding_table.pull_constants_start;
1710
1711 /* For 64-bit loads we need to emit two 32-bit load messages, and we also
1712 * need to shuffle the 32-bit data result into proper 64-bit data. To do
1713 * that we emit the 32-bit loads into a temporary and shuffle the result
1714 * into the original destination.
1715 */
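/* Roughly: the two loads land in consecutive registers of the float-typed
 * temporary, and shuffle_64bit_data() at the end of this function re-packs
 * those 32-bit halves into the 64-bit channel layout expected in orig_temp.
 */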
1716 dst_reg orig_temp = temp;
1717 bool is_64bit = type_sz(orig_src.type) == 8;
1718 if (is_64bit) {
1719 assert(type_sz(temp.type) == 8);
1720 dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
1721 temp = retype(temp_df, BRW_REGISTER_TYPE_F);
1722 }
1723
1724 src_reg src = orig_src;
1725 for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
1726 int reg_offset = base_offset + src.offset / 16;
1727
1728 src_reg offset;
1729 if (indirect.file != BAD_FILE) {
1730 offset = src_reg(this, glsl_type::uint_type);
1731 emit_before(block, inst, ADD(dst_reg(offset), indirect,
1732 brw_imm_ud(reg_offset * 16)));
1733 } else if (devinfo->gen >= 8) {
1734 /* Store the offset in a GRF so we can send-from-GRF. */
1735 offset = src_reg(this, glsl_type::uint_type);
1736 emit_before(block, inst, MOV(dst_reg(offset),
1737 brw_imm_ud(reg_offset * 16)));
1738 } else {
1739 offset = brw_imm_d(reg_offset * 16);
1740 }
1741
1742 emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
1743 brw_imm_ud(index),
1744 offset,
1745 block, inst);
1746
1747 src = byte_offset(src, 16);
1748 }
1749
1750 brw_mark_surface_used(&prog_data->base, index);
1751
1752 if (is_64bit) {
1753 temp = retype(temp, BRW_REGISTER_TYPE_DF);
1754 shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
1755 }
1756 }
1757
1758 /**
1759 * Implements array access of uniforms by inserting a
1760 * PULL_CONSTANT_LOAD instruction.
1761 *
1762 * Unlike temporary GRF array access (which we don't support, due to
1763 * the difficulty of doing relative addressing on instruction
1764 * destinations), we could potentially do array access of uniforms
1765 * that were loaded in GRF space as push constants. In the real-world
1766 * usage we've seen, though, the arrays being used are always larger
1767 * than we could load as push constants, so we just always move all
1768 * uniform array access out to a pull constant buffer.
1769 */
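/* Illustrative example (GLSL, names made up):
 *
 *    uniform vec4 colors[64];
 *    ...
 *    out_color = colors[index];   // lowered to MOV_INDIRECT of a UNIFORM
 *
 * The indirectly addressed uniform range is assigned pull constant slots
 * below, and the MOV_INDIRECT is replaced with a pull constant load.
 */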
1770 void
1771 vec4_visitor::move_uniform_array_access_to_pull_constants()
1772 {
1773 /* The Vulkan driver doesn't support pull constants other than UBOs, so
1774 * everything has to be pushed regardless.
1775 */
1776 if (stage_prog_data->pull_param == NULL) {
1777 split_uniform_registers();
1778 return;
1779 }
1780
1781 int pull_constant_loc[this->uniforms];
1782 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1783
1784 /* First, walk through the instructions and determine which things need to
1785 * be pulled. We mark something as needing to be pulled by setting
1786 * pull_constant_loc to 0.
1787 */
1788 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1789 /* We only care about MOV_INDIRECT of a uniform */
1790 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1791 inst->src[0].file != UNIFORM)
1792 continue;
1793
1794 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1795
1796 for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1797 pull_constant_loc[uniform_nr + j] = 0;
1798 }
1799
1800 /* Next, we walk the list of uniforms and assign real pull constant
1801 * locations and set their corresponding entries in pull_param.
1802 */
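/* Each uniform vec4 that needs pulling contributes four scalar entries to
 * pull_param; pull_constant_loc[] records its slot in 16-byte (vec4)
 * units, which is what emit_pull_constant_load() takes as base_offset.
 */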
1803 for (int j = 0; j < this->uniforms; j++) {
1804 if (pull_constant_loc[j] < 0)
1805 continue;
1806
1807 pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1808
1809 for (int i = 0; i < 4; i++) {
1810 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1811 = stage_prog_data->param[j * 4 + i];
1812 }
1813 }
1814
1815 /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1816 * instructions to actual uniform pulls.
1817 */
1818 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1819 /* We only care about MOV_INDIRECT of a uniform */
1820 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1821 inst->src[0].file != UNIFORM)
1822 continue;
1823
1824 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1825
1826 assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1827
1828 emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1829 pull_constant_loc[uniform_nr], inst->src[1]);
1830 inst->remove(block);
1831 }
1832
1833 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1834 * no need to track them as larger-than-vec4 objects. This will be
1835 * relied on in cutting out unused uniform vectors from push
1836 * constants.
1837 */
1838 split_uniform_registers();
1839 }
1840
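/**
 * If \p reg is an unsigned (UD) source carrying a negate modifier, apply
 * the negation with an explicit MOV into a fresh temporary and rewrite the
 * source to use that temporary instead, since not every consumer of a UD
 * source handles the negate modifier the way we want.
 */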
1841 void
1842 vec4_visitor::resolve_ud_negate(src_reg *reg)
1843 {
1844 if (reg->type != BRW_REGISTER_TYPE_UD ||
1845 !reg->negate)
1846 return;
1847
1848 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1849 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1850 *reg = temp;
1851 }
1852
1853 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1854 void *log_data,
1855 const struct brw_sampler_prog_key_data *key_tex,
1856 struct brw_vue_prog_data *prog_data,
1857 const nir_shader *shader,
1858 void *mem_ctx,
1859 bool no_spills,
1860 int shader_time_index)
1861 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1862 key_tex(key_tex),
1863 prog_data(prog_data),
1864 fail_msg(NULL),
1865 first_non_payload_grf(0),
1866 need_all_constants_in_pull_buffer(false),
1867 no_spills(no_spills),
1868 shader_time_index(shader_time_index),
1869 last_scratch(0)
1870 {
1871 this->failed = false;
1872
1873 this->base_ir = NULL;
1874 this->current_annotation = NULL;
1875 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1876
1877 memset(this->output_num_components, 0, sizeof(this->output_num_components));
1878
1879 this->virtual_grf_start = NULL;
1880 this->virtual_grf_end = NULL;
1881 this->live_intervals = NULL;
1882
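/* On Gen7+ there is no separate MRF file; message payloads are built in
 * GRFs, and the registers from GEN7_MRF_HACK_START upward are reserved to
 * stand in for MRFs, so they are excluded from max_grf here.
 */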
1883 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1884
1885 this->uniforms = 0;
1886 }
1887
1888 vec4_visitor::~vec4_visitor()
1889 {
1890 }
1891
1892
1893 void
1894 vec4_visitor::fail(const char *format, ...)
1895 {
1896 va_list va;
1897 char *msg;
1898
1899 if (failed)
1900 return;
1901
1902 failed = true;
1903
1904 va_start(va, format);
1905 msg = ralloc_vasprintf(mem_ctx, format, va);
1906 va_end(va);
1907 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1908
1909 this->fail_msg = msg;
1910
1911 if (debug_enabled) {
1912 fprintf(stderr, "%s", msg);
1913 }
1914 }
1915
1916 } /* namespace brw */
1917