1 /*
2  * Copyright © 2010 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "compiler/glsl/ir.h"
25 #include "brw_fs.h"
26 #include "brw_fs_surface_builder.h"
27 #include "brw_nir.h"
28 #include "brw_program.h"
29 
30 using namespace brw;
31 using namespace brw::surface_access;
32 
33 void
34 fs_visitor::emit_nir_code()
35 {
36    /* emit the arrays used for inputs and outputs - load/store intrinsics will
37     * be converted to reads/writes of these arrays
38     */
39    nir_setup_outputs();
40    nir_setup_uniforms();
41    nir_emit_system_values();
42 
43    /* get the main function and emit it */
44    nir_foreach_function(function, nir) {
45       assert(strcmp(function->name, "main") == 0);
46       assert(function->impl);
47       nir_emit_impl(function->impl);
48    }
49 }
50 
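/**
 * Allocate VGRFs backing the shader's NIR output variables.  TCS and FS
 * outputs are set up elsewhere, so those stages are skipped here.
 */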
51 void
52 fs_visitor::nir_setup_outputs()
53 {
54    if (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT)
55       return;
56 
57    nir_foreach_variable(var, &nir->outputs) {
58       const unsigned vec4s =
59          var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4)
60                            : type_size_vec4(var->type);
61       fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * vec4s);
62       for (unsigned i = 0; i < vec4s; i++) {
63          if (outputs[var->data.driver_location + i].file == BAD_FILE)
64             outputs[var->data.driver_location + i] = offset(reg, bld, 4 * i);
65       }
66    }
67 }
68 
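/**
 * Record how many uniform slots the shader uses.  Only the compile at the
 * minimum dispatch width sets this up; wider compiles of the same shader
 * inherit the layout from the narrower one.
 */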
69 void
70 fs_visitor::nir_setup_uniforms()
71 {
72    if (dispatch_width != min_dispatch_width)
73       return;
74 
75    uniforms = nir->num_uniforms / 4;
76 }
77 
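/**
 * Scan a block for system-value intrinsics and, the first time each one is
 * seen, emit its setup code and record the register in
 * v->nir_system_values.
 */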
78 static bool
79 emit_system_values_block(nir_block *block, fs_visitor *v)
80 {
81    fs_reg *reg;
82 
83    nir_foreach_instr(instr, block) {
84       if (instr->type != nir_instr_type_intrinsic)
85          continue;
86 
87       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
88       switch (intrin->intrinsic) {
89       case nir_intrinsic_load_vertex_id:
90          unreachable("should be lowered by lower_vertex_id().");
91 
92       case nir_intrinsic_load_vertex_id_zero_base:
93          assert(v->stage == MESA_SHADER_VERTEX);
94          reg = &v->nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE];
95          if (reg->file == BAD_FILE)
96             *reg = *v->emit_vs_system_value(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
97          break;
98 
99       case nir_intrinsic_load_base_vertex:
100          assert(v->stage == MESA_SHADER_VERTEX);
101          reg = &v->nir_system_values[SYSTEM_VALUE_BASE_VERTEX];
102          if (reg->file == BAD_FILE)
103             *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_VERTEX);
104          break;
105 
106       case nir_intrinsic_load_instance_id:
107          assert(v->stage == MESA_SHADER_VERTEX);
108          reg = &v->nir_system_values[SYSTEM_VALUE_INSTANCE_ID];
109          if (reg->file == BAD_FILE)
110             *reg = *v->emit_vs_system_value(SYSTEM_VALUE_INSTANCE_ID);
111          break;
112 
113       case nir_intrinsic_load_base_instance:
114          assert(v->stage == MESA_SHADER_VERTEX);
115          reg = &v->nir_system_values[SYSTEM_VALUE_BASE_INSTANCE];
116          if (reg->file == BAD_FILE)
117             *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_INSTANCE);
118          break;
119 
120       case nir_intrinsic_load_draw_id:
121          assert(v->stage == MESA_SHADER_VERTEX);
122          reg = &v->nir_system_values[SYSTEM_VALUE_DRAW_ID];
123          if (reg->file == BAD_FILE)
124             *reg = *v->emit_vs_system_value(SYSTEM_VALUE_DRAW_ID);
125          break;
126 
127       case nir_intrinsic_load_invocation_id:
128          if (v->stage == MESA_SHADER_TESS_CTRL)
129             break;
130          assert(v->stage == MESA_SHADER_GEOMETRY);
131          reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
132          if (reg->file == BAD_FILE) {
133             const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
134             fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
135             fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
136             abld.SHR(iid, g1, brw_imm_ud(27u));
137             *reg = iid;
138          }
139          break;
140 
141       case nir_intrinsic_load_sample_pos:
142          assert(v->stage == MESA_SHADER_FRAGMENT);
143          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
144          if (reg->file == BAD_FILE)
145             *reg = *v->emit_samplepos_setup();
146          break;
147 
148       case nir_intrinsic_load_sample_id:
149          assert(v->stage == MESA_SHADER_FRAGMENT);
150          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
151          if (reg->file == BAD_FILE)
152             *reg = *v->emit_sampleid_setup();
153          break;
154 
155       case nir_intrinsic_load_sample_mask_in:
156          assert(v->stage == MESA_SHADER_FRAGMENT);
157          assert(v->devinfo->gen >= 7);
158          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
159          if (reg->file == BAD_FILE)
160             *reg = *v->emit_samplemaskin_setup();
161          break;
162 
163       case nir_intrinsic_load_work_group_id:
164          assert(v->stage == MESA_SHADER_COMPUTE);
165          reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID];
166          if (reg->file == BAD_FILE)
167             *reg = *v->emit_cs_work_group_id_setup();
168          break;
169 
170       case nir_intrinsic_load_helper_invocation:
171          assert(v->stage == MESA_SHADER_FRAGMENT);
172          reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION];
173          if (reg->file == BAD_FILE) {
174             const fs_builder abld =
175                v->bld.annotate("gl_HelperInvocation", NULL);
176 
177             /* On Gen6+ (gl_HelperInvocation is only exposed on Gen7+) the
178              * pixel mask is in g1.7 of the thread payload.
179              *
180              * We move the per-channel pixel enable bit to the low bit of each
181              * channel by shifting the byte containing the pixel mask by the
182              * vector immediate 0x76543210UV.
183              *
184              * The region of <1,8,0> reads only 1 byte (the pixel masks for
185              * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
186              * masks for 2 and 3) in SIMD16.
187              */
188             fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1);
189             abld.SHR(shifted,
190                      stride(byte_offset(retype(brw_vec1_grf(1, 0),
191                                                BRW_REGISTER_TYPE_UB), 28),
192                             1, 8, 0),
193                      brw_imm_v(0x76543210));
194 
195             /* A set bit in the pixel mask means the channel is enabled, but
196              * that is the opposite of gl_HelperInvocation so we need to invert
197              * the mask.
198              *
199              * The negate source-modifier bit of logical instructions on Gen8+
200              * performs 1's complement negation, so we can use that instead of
201              * a NOT instruction.
202              */
203             fs_reg inverted = negate(shifted);
204             if (v->devinfo->gen < 8) {
205                inverted = abld.vgrf(BRW_REGISTER_TYPE_UW);
206                abld.NOT(inverted, shifted);
207             }
208 
209             /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
210              * with 1 and negating.
211              */
212             fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
213             abld.AND(anded, inverted, brw_imm_uw(1));
214 
215             fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1);
216             abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D)));
217             *reg = dst;
218          }
219          break;
220 
221       default:
222          break;
223       }
224    }
225 
226    return true;
227 }
228 
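/**
 * Pre-allocate the nir_system_values table and walk the main function's
 * blocks, emitting setup code for every system value the shader uses.
 */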
229 void
230 fs_visitor::nir_emit_system_values()
231 {
232    nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
233    for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
234       nir_system_values[i] = fs_reg();
235    }
236 
237    nir_foreach_function(function, nir) {
238       assert(strcmp(function->name, "main") == 0);
239       assert(function->impl);
240       nir_foreach_block(block, function->impl) {
241          emit_system_values_block(block, this);
242       }
243    }
244 }
245 
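/**
 * Allocate VGRFs for the implementation's local NIR registers, size the
 * SSA-value table, and then emit its control-flow list.
 */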
246 void
247 fs_visitor::nir_emit_impl(nir_function_impl *impl)
248 {
249    nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc);
250    for (unsigned i = 0; i < impl->reg_alloc; i++) {
251       nir_locals[i] = fs_reg();
252    }
253 
254    foreach_list_typed(nir_register, reg, node, &impl->registers) {
255       unsigned array_elems =
256          reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
257       unsigned size = array_elems * reg->num_components;
258       const brw_reg_type reg_type =
259          reg->bit_size == 32 ? BRW_REGISTER_TYPE_F : BRW_REGISTER_TYPE_DF;
260       nir_locals[reg->index] = bld.vgrf(reg_type, size);
261    }
262 
263    nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg,
264                              impl->ssa_alloc);
265 
266    nir_emit_cf_list(&impl->body);
267 }
268 
269 void
270 fs_visitor::nir_emit_cf_list(exec_list *list)
271 {
272    exec_list_validate(list);
273    foreach_list_typed(nir_cf_node, node, node, list) {
274       switch (node->type) {
275       case nir_cf_node_if:
276          nir_emit_if(nir_cf_node_as_if(node));
277          break;
278 
279       case nir_cf_node_loop:
280          nir_emit_loop(nir_cf_node_as_loop(node));
281          break;
282 
283       case nir_cf_node_block:
284          nir_emit_block(nir_cf_node_as_block(node));
285          break;
286 
287       default:
288          unreachable("Invalid CFG node block");
289       }
290    }
291 }
292 
293 void
294 fs_visitor::nir_emit_if(nir_if *if_stmt)
295 {
296    /* first, put the condition into f0 */
297    fs_inst *inst = bld.MOV(bld.null_reg_d(),
298                             retype(get_nir_src(if_stmt->condition),
299                                    BRW_REGISTER_TYPE_D));
300    inst->conditional_mod = BRW_CONDITIONAL_NZ;
301 
302    bld.IF(BRW_PREDICATE_NORMAL);
303 
304    nir_emit_cf_list(&if_stmt->then_list);
305 
306    /* note: if the else is empty, dead CF elimination will remove it */
307    bld.emit(BRW_OPCODE_ELSE);
308 
309    nir_emit_cf_list(&if_stmt->else_list);
310 
311    bld.emit(BRW_OPCODE_ENDIF);
312 }
313 
314 void
315 fs_visitor::nir_emit_loop(nir_loop *loop)
316 {
317    bld.emit(BRW_OPCODE_DO);
318 
319    nir_emit_cf_list(&loop->body);
320 
321    bld.emit(BRW_OPCODE_WHILE);
322 }
323 
324 void
325 fs_visitor::nir_emit_block(nir_block *block)
326 {
327    nir_foreach_instr(instr, block) {
328       nir_emit_instr(instr);
329    }
330 }
331 
332 void
333 fs_visitor::nir_emit_instr(nir_instr *instr)
334 {
335    const fs_builder abld = bld.annotate(NULL, instr);
336 
337    switch (instr->type) {
338    case nir_instr_type_alu:
339       nir_emit_alu(abld, nir_instr_as_alu(instr));
340       break;
341 
342    case nir_instr_type_intrinsic:
343       switch (stage) {
344       case MESA_SHADER_VERTEX:
345          nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
346          break;
347       case MESA_SHADER_TESS_CTRL:
348          nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr));
349          break;
350       case MESA_SHADER_TESS_EVAL:
351          nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
352          break;
353       case MESA_SHADER_GEOMETRY:
354          nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr));
355          break;
356       case MESA_SHADER_FRAGMENT:
357          nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr));
358          break;
359       case MESA_SHADER_COMPUTE:
360          nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr));
361          break;
362       default:
363          unreachable("unsupported shader stage");
364       }
365       break;
366 
367    case nir_instr_type_tex:
368       nir_emit_texture(abld, nir_instr_as_tex(instr));
369       break;
370 
371    case nir_instr_type_load_const:
372       nir_emit_load_const(abld, nir_instr_as_load_const(instr));
373       break;
374 
375    case nir_instr_type_ssa_undef:
376       /* We create a new VGRF for undefs on every use (by handling
377        * them in get_nir_src()), rather than for each definition.
378        * This helps register coalescing eliminate MOVs from undef.
379        */
380       break;
381 
382    case nir_instr_type_jump:
383       nir_emit_jump(abld, nir_instr_as_jump(instr));
384       break;
385 
386    default:
387       unreachable("unknown instruction type");
388    }
389 }
390 
391 /**
392  * Recognizes a parent instruction of nir_op_extract_* and changes the type to
393  * match instr.
394  */
395 bool
396 fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
397                                       const fs_reg &result)
398 {
399    if (!instr->src[0].src.is_ssa ||
400        !instr->src[0].src.ssa->parent_instr)
401       return false;
402 
403    if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
404       return false;
405 
406    nir_alu_instr *src0 =
407       nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
408 
409    if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 &&
410        src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
411       return false;
412 
413    nir_const_value *element = nir_src_as_const_value(src0->src[1].src);
414    assert(element != NULL);
415 
416    /* Element type to extract. */
417    const brw_reg_type type = brw_int_type(
418       src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1,
419       src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8);
420 
421    fs_reg op0 = get_nir_src(src0->src[0].src);
422    op0.type = brw_type_for_nir_type(
423       (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
424                      nir_src_bit_size(src0->src[0].src)));
425    op0 = offset(op0, bld, src0->src[0].swizzle[0]);
426 
427    set_saturate(instr->dest.saturate,
428                 bld.MOV(result, subscript(op0, type, element->u32[0])));
429    return true;
430 }
431 
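/**
 * Try to emit a bcsel between +/-1.0 driven by gl_FrontFacing as a short
 * sequence that reads the front-facing bit straight out of the thread
 * payload.  Returns false if the pattern doesn't match.
 */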
432 bool
433 fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
434                                          const fs_reg &result)
435 {
436    if (!instr->src[0].src.is_ssa ||
437        instr->src[0].src.ssa->parent_instr->type != nir_instr_type_intrinsic)
438       return false;
439 
440    nir_intrinsic_instr *src0 =
441       nir_instr_as_intrinsic(instr->src[0].src.ssa->parent_instr);
442 
443    if (src0->intrinsic != nir_intrinsic_load_front_face)
444       return false;
445 
446    nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
447    if (!value1 || fabsf(value1->f32[0]) != 1.0f)
448       return false;
449 
450    nir_const_value *value2 = nir_src_as_const_value(instr->src[2].src);
451    if (!value2 || fabsf(value2->f32[0]) != 1.0f)
452       return false;
453 
454    fs_reg tmp = vgrf(glsl_type::int_type);
455 
456    if (devinfo->gen >= 6) {
457       /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
458       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
459 
460       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
461        *
462        *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
463        *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
464        *
465        * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
466        *
467        * This negation looks like it's safe in practice, because bits 0:4 will
468        * surely be TRIANGLES
469        */
470 
471       if (value1->f32[0] == -1.0f) {
472          g0.negate = true;
473       }
474 
475       bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
476              g0, brw_imm_uw(0x3f80));
477    } else {
478       /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
479       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
480 
481       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
482        *
483        *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
484        *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
485        *
486        * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
487        *
488        * This negation looks like it's safe in practice, because bits 0:4 will
489        * surely be TRIANGLES
490        */
491 
492       if (value1->f32[0] == -1.0f) {
493          g1_6.negate = true;
494       }
495 
496       bld.OR(tmp, g1_6, brw_imm_d(0x3f800000));
497    }
498    bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));
499 
500    return true;
501 }
502 
503 static void
504 emit_find_msb_using_lzd(const fs_builder &bld,
505                         const fs_reg &result,
506                         const fs_reg &src,
507                         bool is_signed)
508 {
509    fs_inst *inst;
510    fs_reg temp = src;
511 
512    if (is_signed) {
513       /* LZD of an absolute value source almost always does the right
514        * thing.  There are a few problem values:
515        *
516        * * 0x80000000.  Since abs(0x80000000) == 0x80000000, LZD returns
517        *   0.  However, findMSB(int(0x80000000)) == 30.
518        *
519        * * 0xffffffff.  Since abs(0xffffffff) == 1, LZD returns
520        *   31.  Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
521        *
522        *    For a value of zero or negative one, -1 will be returned.
523        *
524        * * Negative powers of two.  LZD(abs(-(1<<x))) returns x, but
525        *   findMSB(-(1<<x)) should return x-1.
526        *
527        * For all negative number cases, including 0x80000000 and
528        * 0xffffffff, the correct value is obtained from LZD if instead of
529        * negating the (already negative) value the logical-not is used.  A
530        * conditional logical-not can be achieved in two instructions.
531        */
532       temp = bld.vgrf(BRW_REGISTER_TYPE_D);
533 
534       bld.ASR(temp, src, brw_imm_d(31));
535       bld.XOR(temp, temp, src);
536    }
537 
538    bld.LZD(retype(result, BRW_REGISTER_TYPE_UD),
539            retype(temp, BRW_REGISTER_TYPE_UD));
540 
541    /* LZD counts from the MSB side, while GLSL's findMSB() wants the count
542     * from the LSB side. Subtract the result from 31 to convert the MSB
543     * count into an LSB count.  If no bits are set, LZD will return 32.
544     * 31-32 = -1, which is exactly what findMSB() is supposed to return.
545     */
546    inst = bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D), brw_imm_d(31));
547    inst->src[0].negate = true;
548 }
549 
550 void
551 fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
552 {
553    struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
554    fs_inst *inst;
555 
556    fs_reg result = get_nir_dest(instr->dest.dest);
557    result.type = brw_type_for_nir_type(
558       (nir_alu_type)(nir_op_infos[instr->op].output_type |
559                      nir_dest_bit_size(instr->dest.dest)));
560 
561    fs_reg op[4];
562    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
563       op[i] = get_nir_src(instr->src[i].src);
564       op[i].type = brw_type_for_nir_type(
565          (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
566                         nir_src_bit_size(instr->src[i].src)));
567       op[i].abs = instr->src[i].abs;
568       op[i].negate = instr->src[i].negate;
569    }
570 
571    /* We get a bunch of mov's out of the from_ssa pass and they may still
572     * be vectorized.  We'll handle them as a special-case.  We'll also
573     * handle vecN here because it's basically the same thing.
574     */
575    switch (instr->op) {
576    case nir_op_imov:
577    case nir_op_fmov:
578    case nir_op_vec2:
579    case nir_op_vec3:
580    case nir_op_vec4: {
581       fs_reg temp = result;
582       bool need_extra_copy = false;
583       for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
584          if (!instr->src[i].src.is_ssa &&
585              instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
586             need_extra_copy = true;
587             temp = bld.vgrf(result.type, 4);
588             break;
589          }
590       }
591 
592       for (unsigned i = 0; i < 4; i++) {
593          if (!(instr->dest.write_mask & (1 << i)))
594             continue;
595 
596          if (instr->op == nir_op_imov || instr->op == nir_op_fmov) {
597             inst = bld.MOV(offset(temp, bld, i),
598                            offset(op[0], bld, instr->src[0].swizzle[i]));
599          } else {
600             inst = bld.MOV(offset(temp, bld, i),
601                            offset(op[i], bld, instr->src[i].swizzle[0]));
602          }
603          inst->saturate = instr->dest.saturate;
604       }
605 
606       /* In this case the source and destination registers were the same,
607        * so we need to insert an extra set of moves in order to deal with
608        * any swizzling.
609        */
610       if (need_extra_copy) {
611          for (unsigned i = 0; i < 4; i++) {
612             if (!(instr->dest.write_mask & (1 << i)))
613                continue;
614 
615             bld.MOV(offset(result, bld, i), offset(temp, bld, i));
616          }
617       }
618       return;
619    }
620    default:
621       break;
622    }
623 
624    /* At this point, we have dealt with any instruction that operates on
625     * more than a single channel.  Therefore, we can just adjust the source
626     * and destination registers for that channel and emit the instruction.
627     */
628    unsigned channel = 0;
629    if (nir_op_infos[instr->op].output_size == 0) {
630       /* Since NIR is doing the scalarizing for us, we should only ever see
631        * vectorized operations with a single channel.
632        */
633       assert(_mesa_bitcount(instr->dest.write_mask) == 1);
634       channel = ffs(instr->dest.write_mask) - 1;
635 
636       result = offset(result, bld, channel);
637    }
638 
639    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
640       assert(nir_op_infos[instr->op].input_sizes[i] < 2);
641       op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
642    }
643 
644    switch (instr->op) {
645    case nir_op_i2f:
646    case nir_op_u2f:
647       if (optimize_extract_to_float(instr, result))
648          return;
649       inst = bld.MOV(result, op[0]);
650       inst->saturate = instr->dest.saturate;
651       break;
652 
653    case nir_op_f2d:
654    case nir_op_i2d:
655    case nir_op_u2d:
656       /* CHV PRM, vol07, 3D Media GPGPU Engine, Register Region Restrictions:
657        *
658        *    "When source or destination is 64b (...), regioning in Align1
659        *     must follow these rules:
660        *
661        *     1. Source and destination horizontal stride must be aligned to
662        *        the same qword.
663        *     (...)"
664        *
665        * This means that 32-bit to 64-bit conversions need to have the 32-bit
666        * data elements aligned to 64-bit. This restriction applies to CHV and
667        * BXT, but not to BDW and the other Gen8+ platforms.
668        */
669       if (devinfo->is_cherryview || devinfo->is_broxton) {
670          fs_reg tmp = bld.vgrf(result.type, 1);
671          tmp = subscript(tmp, op[0].type, 0);
672          inst = bld.MOV(tmp, op[0]);
673          inst = bld.MOV(result, tmp);
674          inst->saturate = instr->dest.saturate;
675          break;
676       }
677       /* fallthrough */
678    case nir_op_d2f:
679    case nir_op_d2i:
680    case nir_op_d2u:
681       inst = bld.MOV(result, op[0]);
682       inst->saturate = instr->dest.saturate;
683       break;
684 
685    case nir_op_f2i:
686    case nir_op_f2u:
687       bld.MOV(result, op[0]);
688       break;
689 
690    case nir_op_fsign: {
691       if (type_sz(op[0].type) < 8) {
692          /* AND(val, 0x80000000) gives the sign bit.
693           *
694           * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
695           * zero.
696           */
697          bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
698 
699          fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
700          op[0].type = BRW_REGISTER_TYPE_UD;
701          result.type = BRW_REGISTER_TYPE_UD;
702          bld.AND(result_int, op[0], brw_imm_ud(0x80000000u));
703 
704          inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
705          inst->predicate = BRW_PREDICATE_NORMAL;
706          if (instr->dest.saturate) {
707             inst = bld.MOV(result, result);
708             inst->saturate = true;
709          }
710       } else {
711          /* For doubles we do the same but we need to consider:
712           *
713           * - 2-src instructions can't operate with 64-bit immediates
714           * - The sign is encoded in the high 32-bit of each DF
715           * - CMP with DF requires special handling in SIMD16
716           * - We need to produce a DF result.
717           */
718 
719          /* 2-src instructions can't have 64-bit immediates, so put 0.0 in
720           * a register and compare with that.
721           */
722          fs_reg tmp = vgrf(glsl_type::double_type);
723          bld.MOV(tmp, setup_imm_df(bld, 0.0));
724 
725          /* A direct DF CMP using the flag register (null dst) won't work in
726           * SIMD16 because the CMP will be split in two by lower_simd_width,
727           * resulting in two CMP instructions with the same dst (NULL),
728           * leading to dead code elimination of the first one. In SIMD8,
729           * however, there is no need to split the CMP and we can save some
730           * work.
731           */
732          fs_reg dst_tmp = vgrf(glsl_type::double_type);
733          bld.CMP(dst_tmp, op[0], tmp, BRW_CONDITIONAL_NZ);
734 
735          /* In SIMD16 we want to avoid using a NULL dst register with DF CMP,
736           * so we store the result of the comparison in a vgrf instead and
737           * then we generate a UD comparison from that that won't have to
738           * be split by lower_simd_width. This is what NIR does to handle
739           * double comparisons in the general case.
740           */
741          if (bld.dispatch_width() == 16) {
742             fs_reg dst_tmp_ud = retype(dst_tmp, BRW_REGISTER_TYPE_UD);
743             bld.MOV(dst_tmp_ud, subscript(dst_tmp, BRW_REGISTER_TYPE_UD, 0));
744             bld.CMP(bld.null_reg_ud(),
745                     dst_tmp_ud, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
746          }
747 
748          /* Get the high 32-bit of each double component where the sign is */
749          fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
750          bld.MOV(result_int, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
751 
752          /* Get the sign bit */
753          bld.AND(result_int, result_int, brw_imm_ud(0x80000000u));
754 
755          /* Add 1.0 to the sign, predicated to skip the case of op[0] == 0.0 */
756          inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
757          inst->predicate = BRW_PREDICATE_NORMAL;
758 
759          /* Convert from 32-bit float to 64-bit double */
760          result.type = BRW_REGISTER_TYPE_DF;
761          inst = bld.MOV(result, retype(result_int, BRW_REGISTER_TYPE_F));
762 
763          if (instr->dest.saturate) {
764             inst = bld.MOV(result, result);
765             inst->saturate = true;
766          }
767       }
768       break;
769    }
770 
771    case nir_op_isign:
772       /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
773        *               -> non-negative val generates 0x00000000.
774        *  Predicated OR sets 1 if val is positive.
775        */
776       assert(nir_dest_bit_size(instr->dest.dest) < 64);
777       bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_G);
778       bld.ASR(result, op[0], brw_imm_d(31));
779       inst = bld.OR(result, result, brw_imm_d(1));
780       inst->predicate = BRW_PREDICATE_NORMAL;
781       break;
782 
783    case nir_op_frcp:
784       inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
785       inst->saturate = instr->dest.saturate;
786       break;
787 
788    case nir_op_fexp2:
789       inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
790       inst->saturate = instr->dest.saturate;
791       break;
792 
793    case nir_op_flog2:
794       inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
795       inst->saturate = instr->dest.saturate;
796       break;
797 
798    case nir_op_fsin:
799       inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
800       inst->saturate = instr->dest.saturate;
801       break;
802 
803    case nir_op_fcos:
804       inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
805       inst->saturate = instr->dest.saturate;
806       break;
807 
808    case nir_op_fddx:
809       if (fs_key->high_quality_derivatives) {
810          inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
811       } else {
812          inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
813       }
814       inst->saturate = instr->dest.saturate;
815       break;
816    case nir_op_fddx_fine:
817       inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
818       inst->saturate = instr->dest.saturate;
819       break;
820    case nir_op_fddx_coarse:
821       inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
822       inst->saturate = instr->dest.saturate;
823       break;
824    case nir_op_fddy:
825       if (fs_key->high_quality_derivatives) {
826          inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
827       } else {
828          inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
829       }
830       inst->saturate = instr->dest.saturate;
831       break;
832    case nir_op_fddy_fine:
833       inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
834       inst->saturate = instr->dest.saturate;
835       break;
836    case nir_op_fddy_coarse:
837       inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
838       inst->saturate = instr->dest.saturate;
839       break;
840 
841    case nir_op_iadd:
842       assert(nir_dest_bit_size(instr->dest.dest) < 64);
843    case nir_op_fadd:
844       inst = bld.ADD(result, op[0], op[1]);
845       inst->saturate = instr->dest.saturate;
846       break;
847 
848    case nir_op_fmul:
849       inst = bld.MUL(result, op[0], op[1]);
850       inst->saturate = instr->dest.saturate;
851       break;
852 
853    case nir_op_imul:
854       assert(nir_dest_bit_size(instr->dest.dest) < 64);
855       bld.MUL(result, op[0], op[1]);
856       break;
857 
858    case nir_op_imul_high:
859    case nir_op_umul_high:
860       assert(nir_dest_bit_size(instr->dest.dest) < 64);
861       bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
862       break;
863 
864    case nir_op_idiv:
865    case nir_op_udiv:
866       assert(nir_dest_bit_size(instr->dest.dest) < 64);
867       bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
868       break;
869 
870    case nir_op_uadd_carry:
871       unreachable("Should have been lowered by carry_to_arith().");
872 
873    case nir_op_usub_borrow:
874       unreachable("Should have been lowered by borrow_to_arith().");
875 
876    case nir_op_umod:
877    case nir_op_irem:
878       /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
879        * appears that our hardware just does the right thing for signed
880        * remainder.
881        */
882       assert(nir_dest_bit_size(instr->dest.dest) < 64);
883       bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
884       break;
885 
886    case nir_op_imod: {
887       /* Get a regular C-style remainder.  If a % b != 0, set the predicate. */
888       bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
889 
890       /* Math instructions don't support conditional mod */
891       inst = bld.MOV(bld.null_reg_d(), result);
892       inst->conditional_mod = BRW_CONDITIONAL_NZ;
893 
894       /* Now, we need to determine if signs of the sources are different.
895        * When we XOR the sources, the top bit is 0 if they are the same and 1
896        * if they are different.  We can then use a conditional modifier to
897        * turn that into a predicate.  This leads us to an XOR.l instruction.
898        *
899        * Technically, according to the PRM, you're not allowed to use .l on a
900        * XOR instruction.  However, empirical experiments and Curro's reading
901        * of the simulator source both indicate that it's safe.
902        */
903       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
904       inst = bld.XOR(tmp, op[0], op[1]);
905       inst->predicate = BRW_PREDICATE_NORMAL;
906       inst->conditional_mod = BRW_CONDITIONAL_L;
907 
908       /* If the result of the initial remainder operation is non-zero and the
909        * two sources have different signs, add in a copy of op[1] to get the
910        * final integer modulus value.
911        */
912       inst = bld.ADD(result, result, op[1]);
913       inst->predicate = BRW_PREDICATE_NORMAL;
914       break;
915    }
916 
917    case nir_op_flt:
918    case nir_op_fge:
919    case nir_op_feq:
920    case nir_op_fne: {
921       fs_reg dest = result;
922       if (nir_src_bit_size(instr->src[0].src) > 32) {
923          dest = bld.vgrf(BRW_REGISTER_TYPE_DF, 1);
924       }
925       brw_conditional_mod cond;
926       switch (instr->op) {
927       case nir_op_flt:
928          cond = BRW_CONDITIONAL_L;
929          break;
930       case nir_op_fge:
931          cond = BRW_CONDITIONAL_GE;
932          break;
933       case nir_op_feq:
934          cond = BRW_CONDITIONAL_Z;
935          break;
936       case nir_op_fne:
937          cond = BRW_CONDITIONAL_NZ;
938          break;
939       default:
940          unreachable("bad opcode");
941       }
942       bld.CMP(dest, op[0], op[1], cond);
943       if (nir_src_bit_size(instr->src[0].src) > 32) {
944          bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
945       }
946       break;
947    }
948 
949    case nir_op_ilt:
950    case nir_op_ult:
951       assert(nir_dest_bit_size(instr->dest.dest) < 64);
952       bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_L);
953       break;
954 
955    case nir_op_ige:
956    case nir_op_uge:
957       assert(nir_dest_bit_size(instr->dest.dest) < 64);
958       bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_GE);
959       break;
960 
961    case nir_op_ieq:
962       assert(nir_dest_bit_size(instr->dest.dest) < 64);
963       bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_Z);
964       break;
965 
966    case nir_op_ine:
967       assert(nir_dest_bit_size(instr->dest.dest) < 64);
968       bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_NZ);
969       break;
970 
971    case nir_op_inot:
972       assert(nir_dest_bit_size(instr->dest.dest) < 64);
973       if (devinfo->gen >= 8) {
974          op[0] = resolve_source_modifiers(op[0]);
975       }
976       bld.NOT(result, op[0]);
977       break;
978    case nir_op_ixor:
979       assert(nir_dest_bit_size(instr->dest.dest) < 64);
980       if (devinfo->gen >= 8) {
981          op[0] = resolve_source_modifiers(op[0]);
982          op[1] = resolve_source_modifiers(op[1]);
983       }
984       bld.XOR(result, op[0], op[1]);
985       break;
986    case nir_op_ior:
987       assert(nir_dest_bit_size(instr->dest.dest) < 64);
988       if (devinfo->gen >= 8) {
989          op[0] = resolve_source_modifiers(op[0]);
990          op[1] = resolve_source_modifiers(op[1]);
991       }
992       bld.OR(result, op[0], op[1]);
993       break;
994    case nir_op_iand:
995       assert(nir_dest_bit_size(instr->dest.dest) < 64);
996       if (devinfo->gen >= 8) {
997          op[0] = resolve_source_modifiers(op[0]);
998          op[1] = resolve_source_modifiers(op[1]);
999       }
1000       bld.AND(result, op[0], op[1]);
1001       break;
1002 
1003    case nir_op_fdot2:
1004    case nir_op_fdot3:
1005    case nir_op_fdot4:
1006    case nir_op_ball_fequal2:
1007    case nir_op_ball_iequal2:
1008    case nir_op_ball_fequal3:
1009    case nir_op_ball_iequal3:
1010    case nir_op_ball_fequal4:
1011    case nir_op_ball_iequal4:
1012    case nir_op_bany_fnequal2:
1013    case nir_op_bany_inequal2:
1014    case nir_op_bany_fnequal3:
1015    case nir_op_bany_inequal3:
1016    case nir_op_bany_fnequal4:
1017    case nir_op_bany_inequal4:
1018       unreachable("Lowered by nir_lower_alu_reductions");
1019 
1020    case nir_op_fnoise1_1:
1021    case nir_op_fnoise1_2:
1022    case nir_op_fnoise1_3:
1023    case nir_op_fnoise1_4:
1024    case nir_op_fnoise2_1:
1025    case nir_op_fnoise2_2:
1026    case nir_op_fnoise2_3:
1027    case nir_op_fnoise2_4:
1028    case nir_op_fnoise3_1:
1029    case nir_op_fnoise3_2:
1030    case nir_op_fnoise3_3:
1031    case nir_op_fnoise3_4:
1032    case nir_op_fnoise4_1:
1033    case nir_op_fnoise4_2:
1034    case nir_op_fnoise4_3:
1035    case nir_op_fnoise4_4:
1036       unreachable("not reached: should be handled by lower_noise");
1037 
1038    case nir_op_ldexp:
1039       unreachable("not reached: should be handled by ldexp_to_arith()");
1040 
1041    case nir_op_fsqrt:
1042       inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
1043       inst->saturate = instr->dest.saturate;
1044       break;
1045 
1046    case nir_op_frsq:
1047       inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
1048       inst->saturate = instr->dest.saturate;
1049       break;
1050 
1051    case nir_op_b2i:
1052    case nir_op_b2f:
1053       bld.MOV(result, negate(op[0]));
1054       break;
1055 
1056    case nir_op_f2b:
1057       bld.CMP(result, op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
1058       break;
1059    case nir_op_d2b: {
1060       /* two-argument instructions can't take 64-bit immediates */
1061       fs_reg zero = vgrf(glsl_type::double_type);
1062       bld.MOV(zero, setup_imm_df(bld, 0.0));
1063       /* A SIMD16 execution needs to be split into two instructions, so use
1064        * a vgrf instead of the flag register as the dst so that instruction
1065        * splitting works.
1066        */
1067       fs_reg tmp = vgrf(glsl_type::double_type);
1068       bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ);
1069       bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0));
1070       break;
1071    }
1072    case nir_op_i2b:
1073       bld.CMP(result, op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
1074       break;
1075 
1076    case nir_op_ftrunc:
1077       inst = bld.RNDZ(result, op[0]);
1078       inst->saturate = instr->dest.saturate;
1079       break;
1080 
1081    case nir_op_fceil: {
1082       op[0].negate = !op[0].negate;
1083       fs_reg temp = vgrf(glsl_type::float_type);
1084       bld.RNDD(temp, op[0]);
1085       temp.negate = true;
1086       inst = bld.MOV(result, temp);
1087       inst->saturate = instr->dest.saturate;
1088       break;
1089    }
1090    case nir_op_ffloor:
1091       inst = bld.RNDD(result, op[0]);
1092       inst->saturate = instr->dest.saturate;
1093       break;
1094    case nir_op_ffract:
1095       inst = bld.FRC(result, op[0]);
1096       inst->saturate = instr->dest.saturate;
1097       break;
1098    case nir_op_fround_even:
1099       inst = bld.RNDE(result, op[0]);
1100       inst->saturate = instr->dest.saturate;
1101       break;
1102 
1103    case nir_op_fquantize2f16: {
1104       fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D);
1105       fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F);
1106       fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F);
1107 
1108       /* The destination stride must be at least as big as the source stride. */
1109       tmp16.type = BRW_REGISTER_TYPE_W;
1110       tmp16.stride = 2;
1111 
1112       /* Check for denormal */
1113       fs_reg abs_src0 = op[0];
1114       abs_src0.abs = true;
1115       bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
1116               BRW_CONDITIONAL_L);
1117       /* Get the appropriately signed zero */
1118       bld.AND(retype(zero, BRW_REGISTER_TYPE_UD),
1119               retype(op[0], BRW_REGISTER_TYPE_UD),
1120               brw_imm_ud(0x80000000));
1121       /* Do the actual F32 -> F16 -> F32 conversion */
1122       bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]);
1123       bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16);
1124       /* Select that or zero based on normal status */
1125       inst = bld.SEL(result, zero, tmp32);
1126       inst->predicate = BRW_PREDICATE_NORMAL;
1127       inst->saturate = instr->dest.saturate;
1128       break;
1129    }
1130 
1131    case nir_op_imin:
1132    case nir_op_umin:
1133       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1134    case nir_op_fmin:
1135       inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
1136       inst->saturate = instr->dest.saturate;
1137       break;
1138 
1139    case nir_op_imax:
1140    case nir_op_umax:
1141       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1142    case nir_op_fmax:
1143       inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
1144       inst->saturate = instr->dest.saturate;
1145       break;
1146 
1147    case nir_op_pack_snorm_2x16:
1148    case nir_op_pack_snorm_4x8:
1149    case nir_op_pack_unorm_2x16:
1150    case nir_op_pack_unorm_4x8:
1151    case nir_op_unpack_snorm_2x16:
1152    case nir_op_unpack_snorm_4x8:
1153    case nir_op_unpack_unorm_2x16:
1154    case nir_op_unpack_unorm_4x8:
1155    case nir_op_unpack_half_2x16:
1156    case nir_op_pack_half_2x16:
1157       unreachable("not reached: should be handled by lower_packing_builtins");
1158 
1159    case nir_op_unpack_half_2x16_split_x:
1160       inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, result, op[0]);
1161       inst->saturate = instr->dest.saturate;
1162       break;
1163    case nir_op_unpack_half_2x16_split_y:
1164       inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, result, op[0]);
1165       inst->saturate = instr->dest.saturate;
1166       break;
1167 
1168    case nir_op_pack_double_2x32_split:
1169       bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
1170       break;
1171 
1172    case nir_op_unpack_double_2x32_split_x:
1173    case nir_op_unpack_double_2x32_split_y: {
1174       /* Optimize the common case where we are unpacking from a double we have
1175        * previously packed. In this case we can just bypass the pack operation
1176        * and source directly from its arguments.
1177        */
1178       unsigned index = (instr->op == nir_op_unpack_double_2x32_split_x) ? 0 : 1;
1179       if (instr->src[0].src.is_ssa) {
1180          nir_instr *parent_instr = instr->src[0].src.ssa->parent_instr;
1181          if (parent_instr->type == nir_instr_type_alu) {
1182             nir_alu_instr *alu_parent = nir_instr_as_alu(parent_instr);
1183             if (alu_parent->op == nir_op_pack_double_2x32_split &&
1184                 alu_parent->src[index].src.is_ssa) {
1185                op[0] = retype(get_nir_src(alu_parent->src[index].src),
1186                               BRW_REGISTER_TYPE_UD);
1187                op[0] =
1188                   offset(op[0], bld, alu_parent->src[index].swizzle[channel]);
1189                bld.MOV(result, op[0]);
1190                break;
1191             }
1192          }
1193       }
1194 
1195       if (instr->op == nir_op_unpack_double_2x32_split_x)
1196          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0));
1197       else
1198          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
1199       break;
1200    }
1201 
1202    case nir_op_fpow:
1203       inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
1204       inst->saturate = instr->dest.saturate;
1205       break;
1206 
1207    case nir_op_bitfield_reverse:
1208       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1209       bld.BFREV(result, op[0]);
1210       break;
1211 
1212    case nir_op_bit_count:
1213       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1214       bld.CBIT(result, op[0]);
1215       break;
1216 
1217    case nir_op_ufind_msb: {
1218       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1219       emit_find_msb_using_lzd(bld, result, op[0], false);
1220       break;
1221    }
1222 
1223    case nir_op_ifind_msb: {
1224       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1225 
1226       if (devinfo->gen < 7) {
1227          emit_find_msb_using_lzd(bld, result, op[0], true);
1228       } else {
1229          bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
1230 
1231          /* FBH counts from the MSB side, while GLSL's findMSB() wants the
1232           * count from the LSB side. If FBH didn't return an error
1233           * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
1234           * count into an LSB count.
1235           */
1236          bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);
1237 
1238          inst = bld.ADD(result, result, brw_imm_d(31));
1239          inst->predicate = BRW_PREDICATE_NORMAL;
1240          inst->src[0].negate = true;
1241       }
1242       break;
1243    }
1244 
1245    case nir_op_find_lsb:
1246       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1247 
1248       if (devinfo->gen < 7) {
1249          fs_reg temp = vgrf(glsl_type::int_type);
1250 
1251          /* (x & -x) generates a value that consists of only the LSB of x.
1252           * For all powers of 2, findMSB(y) == findLSB(y).
1253           */
1254          fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D);
1255          fs_reg negated_src = src;
1256 
1257          /* One must be negated, and the other must be non-negated.  It
1258           * doesn't matter which is which.
1259           */
1260          negated_src.negate = true;
1261          src.negate = false;
1262 
1263          bld.AND(temp, src, negated_src);
1264          emit_find_msb_using_lzd(bld, result, temp, false);
1265       } else {
1266          bld.FBL(result, op[0]);
1267       }
1268       break;
1269 
1270    case nir_op_ubitfield_extract:
1271    case nir_op_ibitfield_extract:
1272       unreachable("should have been lowered");
1273    case nir_op_ubfe:
1274    case nir_op_ibfe:
1275       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1276       bld.BFE(result, op[2], op[1], op[0]);
1277       break;
1278    case nir_op_bfm:
1279       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1280       bld.BFI1(result, op[0], op[1]);
1281       break;
1282    case nir_op_bfi:
1283       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1284       bld.BFI2(result, op[0], op[1], op[2]);
1285       break;
1286 
1287    case nir_op_bitfield_insert:
1288       unreachable("not reached: should have been lowered");
1289 
1290    case nir_op_ishl:
1291       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1292       bld.SHL(result, op[0], op[1]);
1293       break;
1294    case nir_op_ishr:
1295       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1296       bld.ASR(result, op[0], op[1]);
1297       break;
1298    case nir_op_ushr:
1299       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1300       bld.SHR(result, op[0], op[1]);
1301       break;
1302 
1303    case nir_op_pack_half_2x16_split:
1304       bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
1305       break;
1306 
1307    case nir_op_ffma:
1308       inst = bld.MAD(result, op[2], op[1], op[0]);
1309       inst->saturate = instr->dest.saturate;
1310       break;
1311 
1312    case nir_op_flrp:
1313       inst = bld.LRP(result, op[0], op[1], op[2]);
1314       inst->saturate = instr->dest.saturate;
1315       break;
1316 
1317    case nir_op_bcsel:
1318       if (optimize_frontfacing_ternary(instr, result))
1319          return;
1320 
1321       bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
1322       inst = bld.SEL(result, op[1], op[2]);
1323       inst->predicate = BRW_PREDICATE_NORMAL;
1324       break;
1325 
1326    case nir_op_extract_u8:
1327    case nir_op_extract_i8: {
1328       const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
1329       nir_const_value *byte = nir_src_as_const_value(instr->src[1].src);
1330       assert(byte != NULL);
1331       bld.MOV(result, subscript(op[0], type, byte->u32[0]));
1332       break;
1333    }
1334 
1335    case nir_op_extract_u16:
1336    case nir_op_extract_i16: {
1337       const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16);
1338       nir_const_value *word = nir_src_as_const_value(instr->src[1].src);
1339       assert(word != NULL);
1340       bld.MOV(result, subscript(op[0], type, word->u32[0]));
1341       break;
1342    }
1343 
1344    default:
1345       unreachable("unhandled instruction");
1346    }
1347 
1348    /* If we need to do a boolean resolve, replace the result with -(x & 1)
1349     * to sign extend the low bit to 0/~0
1350     */
1351    if (devinfo->gen <= 5 &&
1352        (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
1353       fs_reg masked = vgrf(glsl_type::int_type);
1354       bld.AND(masked, result, brw_imm_d(1));
1355       masked.negate = true;
1356       bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked);
1357    }
1358 }
1359 
1360 void
1361 fs_visitor::nir_emit_load_const(const fs_builder &bld,
1362                                 nir_load_const_instr *instr)
1363 {
1364    const brw_reg_type reg_type =
1365       instr->def.bit_size == 32 ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_DF;
1366    fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
1367 
1368    switch (instr->def.bit_size) {
1369    case 32:
1370       for (unsigned i = 0; i < instr->def.num_components; i++)
1371          bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value.i32[i]));
1372       break;
1373 
1374    case 64:
1375       for (unsigned i = 0; i < instr->def.num_components; i++)
1376          bld.MOV(offset(reg, bld, i),
1377                  setup_imm_df(bld, instr->value.f64[i]));
1378       break;
1379 
1380    default:
1381       unreachable("Invalid bit size");
1382    }
1383 
1384    nir_ssa_values[instr->def.index] = reg;
1385 }
1386 
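/**
 * Return the fs_reg backing a NIR source.  SSA undefs get a fresh VGRF at
 * each use, and everything is retyped to D by default (see the comment at
 * the end of the function).
 */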
1387 fs_reg
1388 fs_visitor::get_nir_src(const nir_src &src)
1389 {
1390    fs_reg reg;
1391    if (src.is_ssa) {
1392       if (src.ssa->parent_instr->type == nir_instr_type_ssa_undef) {
1393          const brw_reg_type reg_type = src.ssa->bit_size == 32 ?
1394             BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_DF;
1395          reg = bld.vgrf(reg_type, src.ssa->num_components);
1396       } else {
1397          reg = nir_ssa_values[src.ssa->index];
1398       }
1399    } else {
1400       /* We don't handle indirects on locals */
1401       assert(src.reg.indirect == NULL);
1402       reg = offset(nir_locals[src.reg.reg->index], bld,
1403                    src.reg.base_offset * src.reg.reg->num_components);
1404    }
1405 
1406    /* to avoid floating-point denorm flushing problems, set the type by
1407     * default to D - instructions that need floating point semantics will set
1408     * this to F if they need to
1409     */
1410    return retype(reg, BRW_REGISTER_TYPE_D);
1411 }
1412 
1413 /**
1414  * Return an IMM for constants; otherwise call get_nir_src() as normal.
1415  */
1416 fs_reg
1417 fs_visitor::get_nir_src_imm(const nir_src &src)
1418 {
1419    nir_const_value *val = nir_src_as_const_value(src);
1420    return val ? fs_reg(brw_imm_d(val->i32[0])) : get_nir_src(src);
1421 }
1422 
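/**
 * Return the fs_reg an instruction should write to: a freshly allocated
 * VGRF for an SSA destination, or the backing register for a NIR register.
 */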
1423 fs_reg
1424 fs_visitor::get_nir_dest(const nir_dest &dest)
1425 {
1426    if (dest.is_ssa) {
1427       const brw_reg_type reg_type =
1428          dest.ssa.bit_size == 32 ? BRW_REGISTER_TYPE_F : BRW_REGISTER_TYPE_DF;
1429       nir_ssa_values[dest.ssa.index] =
1430          bld.vgrf(reg_type, dest.ssa.num_components);
1431       return nir_ssa_values[dest.ssa.index];
1432    } else {
1433       /* We don't handle indirects on locals */
1434       assert(dest.reg.indirect == NULL);
1435       return offset(nir_locals[dest.reg.reg->index], bld,
1436                     dest.reg.base_offset * dest.reg.reg->num_components);
1437    }
1438 }
1439 
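/**
 * Walk an image variable dereference chain and return the UNIFORM register
 * holding its image parameters, applying clamped indirect addressing when
 * the chain contains indirect array indices.
 */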
1440 fs_reg
1441 fs_visitor::get_nir_image_deref(const nir_deref_var *deref)
1442 {
1443    fs_reg image(UNIFORM, deref->var->data.driver_location / 4,
1444                 BRW_REGISTER_TYPE_UD);
1445    fs_reg indirect;
1446    unsigned indirect_max = 0;
1447 
1448    for (const nir_deref *tail = &deref->deref; tail->child;
1449         tail = tail->child) {
1450       const nir_deref_array *deref_array = nir_deref_as_array(tail->child);
1451       assert(tail->child->deref_type == nir_deref_type_array);
1452       const unsigned size = glsl_get_length(tail->type);
1453       const unsigned element_size = type_size_scalar(deref_array->deref.type);
1454       const unsigned base = MIN2(deref_array->base_offset, size - 1);
1455       image = offset(image, bld, base * element_size);
1456 
1457       if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
1458          fs_reg tmp = vgrf(glsl_type::uint_type);
1459 
1460          /* Accessing an invalid surface index with the dataport can result
1461           * in a hang.  According to the spec "if the index used to
1462           * select an individual element is negative or greater than or
1463           * equal to the size of the array, the results of the operation
1464           * are undefined but may not lead to termination" -- which is one
1465           * of the possible outcomes of the hang.  Clamp the index to
1466           * prevent access outside of the array bounds.
1467           */
1468          bld.emit_minmax(tmp, retype(get_nir_src(deref_array->indirect),
1469                                      BRW_REGISTER_TYPE_UD),
1470                          brw_imm_ud(size - base - 1), BRW_CONDITIONAL_L);
1471 
1472          indirect_max += element_size * (tail->type->length - 1);
1473 
1474          bld.MUL(tmp, tmp, brw_imm_ud(element_size * 4));
1475          if (indirect.file == BAD_FILE) {
1476             indirect = tmp;
1477          } else {
1478             bld.ADD(indirect, indirect, tmp);
1479          }
1480       }
1481    }
1482 
1483    if (indirect.file == BAD_FILE) {
1484       return image;
1485    } else {
1486       /* Emit a pile of MOVs to load the uniform into a temporary.  The
1487        * dead-code elimination pass will get rid of what we don't use.
1488        */
1489       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, BRW_IMAGE_PARAM_SIZE);
1490       for (unsigned j = 0; j < BRW_IMAGE_PARAM_SIZE; j++) {
1491          bld.emit(SHADER_OPCODE_MOV_INDIRECT,
1492                   offset(tmp, bld, j), offset(image, bld, j),
1493                   indirect, brw_imm_ud((indirect_max + 1) * 4));
1494       }
1495       return tmp;
1496    }
1497 }
1498 
1499 void
1500 fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
1501                          unsigned wr_mask)
1502 {
1503    for (unsigned i = 0; i < 4; i++) {
1504       if (!((wr_mask >> i) & 1))
1505          continue;
1506 
1507       fs_inst *new_inst = new(mem_ctx) fs_inst(inst);
1508       new_inst->dst = offset(new_inst->dst, bld, i);
1509       for (unsigned j = 0; j < new_inst->sources; j++)
1510          if (new_inst->src[j].file == VGRF)
1511             new_inst->src[j] = offset(new_inst->src[j], bld, i);
1512 
1513       bld.emit(new_inst);
1514    }
1515 }
1516 
1517 /**
1518  * Get the matching channel register datatype for an image intrinsic of the
1519  * specified GLSL image type.
1520  */
1521 static brw_reg_type
1522 get_image_base_type(const glsl_type *type)
1523 {
1524    switch ((glsl_base_type)type->sampled_type) {
1525    case GLSL_TYPE_UINT:
1526       return BRW_REGISTER_TYPE_UD;
1527    case GLSL_TYPE_INT:
1528       return BRW_REGISTER_TYPE_D;
1529    case GLSL_TYPE_FLOAT:
1530       return BRW_REGISTER_TYPE_F;
1531    default:
1532       unreachable("Not reached.");
1533    }
1534 }
1535 
1536 /**
1537  * Get the appropriate atomic op for an image atomic intrinsic.
1538  */
1539 static unsigned
1540 get_image_atomic_op(nir_intrinsic_op op, const glsl_type *type)
1541 {
1542    switch (op) {
1543    case nir_intrinsic_image_atomic_add:
1544       return BRW_AOP_ADD;
1545    case nir_intrinsic_image_atomic_min:
1546       return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
1547               BRW_AOP_IMIN : BRW_AOP_UMIN);
1548    case nir_intrinsic_image_atomic_max:
1549       return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
1550               BRW_AOP_IMAX : BRW_AOP_UMAX);
1551    case nir_intrinsic_image_atomic_and:
1552       return BRW_AOP_AND;
1553    case nir_intrinsic_image_atomic_or:
1554       return BRW_AOP_OR;
1555    case nir_intrinsic_image_atomic_xor:
1556       return BRW_AOP_XOR;
1557    case nir_intrinsic_image_atomic_exchange:
1558       return BRW_AOP_MOV;
1559    case nir_intrinsic_image_atomic_comp_swap:
1560       return BRW_AOP_CMPWR;
1561    default:
1562       unreachable("Not reachable.");
1563    }
1564 }
1565 
1566 static fs_inst *
1567 emit_pixel_interpolater_send(const fs_builder &bld,
1568                              enum opcode opcode,
1569                              const fs_reg &dst,
1570                              const fs_reg &src,
1571                              const fs_reg &desc,
1572                              glsl_interp_mode interpolation)
1573 {
1574    struct brw_wm_prog_data *wm_prog_data =
1575       brw_wm_prog_data(bld.shader->stage_prog_data);
1576    fs_inst *inst;
1577    fs_reg payload;
1578    int mlen;
1579 
1580    if (src.file == BAD_FILE) {
1581       /* Dummy payload */
1582       payload = bld.vgrf(BRW_REGISTER_TYPE_F, 1);
1583       mlen = 1;
1584    } else {
1585       payload = src;
1586       mlen = 2 * bld.dispatch_width() / 8;
1587    }
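        /* With a real payload this works out to mlen == 2 in SIMD8 and
         * mlen == 4 in SIMD16, i.e. two payload registers per group of 8
         * channels.
         */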
1588 
1589    inst = bld.emit(opcode, dst, payload, desc);
1590    inst->mlen = mlen;
1591    /* 2 floats per slot returned */
1592    inst->size_written = 2 * dst.component_size(inst->exec_size);
1593    inst->pi_noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE;
1594 
1595    wm_prog_data->pulls_bary = true;
1596 
1597    return inst;
1598 }
1599 
1600 /**
1601  * Computes 1 << x, given a D/UD register containing some value x.
1602  */
1603 static fs_reg
1604 intexp2(const fs_builder &bld, const fs_reg &x)
1605 {
1606    assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D);
1607 
1608    fs_reg result = bld.vgrf(x.type, 1);
1609    fs_reg one = bld.vgrf(x.type, 1);
1610 
1611    bld.MOV(one, retype(brw_imm_d(1), one.type));
1612    bld.SHL(result, one, x);
1613    return result;
1614 }
1615 
1616 void
1617 fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)
1618 {
1619    assert(stage == MESA_SHADER_GEOMETRY);
1620 
1621    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1622 
1623    if (gs_compile->control_data_header_size_bits == 0)
1624       return;
1625 
1626    /* We can only do EndPrimitive() functionality when the control data
1627     * consists of cut bits.  Fortunately, the only time it isn't is when the
1628     * output type is points, in which case EndPrimitive() is a no-op.
1629     */
1630    if (gs_prog_data->control_data_format !=
1631        GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
1632       return;
1633    }
1634 
1635    /* Cut bits use one bit per vertex. */
1636    assert(gs_compile->control_data_bits_per_vertex == 1);
1637 
1638    fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
1639    vertex_count.type = BRW_REGISTER_TYPE_UD;
1640 
1641    /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
1642     * vertex n, 0 otherwise.  So all we need to do here is mark bit
1643     * (vertex_count - 1) % 32 in the cut_bits register to indicate that
1644     * EndPrimitive() was called after emitting vertex (vertex_count - 1);
1645     * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
1646     *
1647     * Note that if EndPrimitive() is called before emitting any vertices, this
1648     * will cause us to set bit 31 of the control_data_bits register to 1.
1649     * That's fine because:
1650     *
1651     * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
1652     *   output, so the hardware will ignore cut bit 31.
1653     *
1654     * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
1655     *   last vertex, so setting cut bit 31 has no effect (since the primitive
1656     *   is automatically ended when the GS terminates).
1657     *
1658     * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
1659     *   control_data_bits register to 0 when the first vertex is emitted.
1660     */
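        /* Worked example: if EndPrimitive() is called right after the third
         * vertex is emitted, vertex_count == 3 at that point, so we set bit
         * (3 - 1) % 32 == 2 of control_data_bits.
         */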
1661 
1662    const fs_builder abld = bld.annotate("end primitive");
1663 
1664    /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
1665    fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1666    abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
1667    fs_reg mask = intexp2(abld, prev_count);
1668    /* Note: we're relying on the fact that the GEN SHL instruction only pays
1669     * attention to the lower 5 bits of its second source argument, so on this
1670     * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
1671     * ((vertex_count - 1) % 32).
1672     */
1673    abld.OR(this->control_data_bits, this->control_data_bits, mask);
1674 }
1675 
1676 void
1677 fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
1678 {
1679    assert(stage == MESA_SHADER_GEOMETRY);
1680    assert(gs_compile->control_data_bits_per_vertex != 0);
1681 
1682    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1683 
1684    const fs_builder abld = bld.annotate("emit control data bits");
1685    const fs_builder fwa_bld = bld.exec_all();
1686 
1687    /* We use a single UD register to accumulate control data bits (32 bits
1688     * for each of the SIMD8 channels).  So we need to write a DWord (32 bits)
1689     * at a time.
1690     *
1691     * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
1692     * We have to select a 128-bit group via the Global and Per-Slot Offsets, then
1693     * use the Channel Mask phase to enable/disable which DWord within that
1694     * group to write.  (Remember, different SIMD8 channels may have emitted
1695     * different numbers of vertices, so we may need per-slot offsets.)
1696     *
1697     * Channel masking presents an annoying problem: we may have to replicate
1698     * the data up to 4 times:
1699     *
1700     * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
1701     *
1702     * To avoid penalizing shaders that emit a small number of vertices, we
1703     * can avoid these sometimes: if the size of the control data header is
1704     * <= 128 bits, then there is only 1 OWord.  All SIMD8 channels will
1705     * land in the same 128-bit group, so we can skip per-slot offsets.
1706     *
1707     * Similarly, if the control data header is <= 32 bits, there is only one
1708     * DWord, so we can skip channel masks.
1709     */
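        /* For example, with cut bits (1 bit per vertex) a GS that emits at
         * most 16 vertices accumulates at most 16 control data bits, so the
         * plain URB_WRITE_SIMD8 path suffices; with stream IDs (2 bits per
         * vertex) and up to 64 vertices the header is 64 * 2 == 128 bits,
         * which needs channel masks but can still skip per-slot offsets.
         */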
1710    enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
1711 
1712    fs_reg channel_mask, per_slot_offset;
1713 
1714    if (gs_compile->control_data_header_size_bits > 32) {
1715       opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
1716       channel_mask = vgrf(glsl_type::uint_type);
1717    }
1718 
1719    if (gs_compile->control_data_header_size_bits > 128) {
1720       opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
1721       per_slot_offset = vgrf(glsl_type::uint_type);
1722    }
1723 
1724    /* Figure out which DWord we're trying to write to using the formula:
1725     *
1726     *    dword_index = (vertex_count - 1) * bits_per_vertex / 32
1727     *
1728     * Since bits_per_vertex is a power of two, and is known at compile
1729     * time, this can be optimized to:
1730     *
1731     *    dword_index = (vertex_count - 1) >> (5 - log2(bits_per_vertex))
1732     */
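        /* The code below expresses the shift as 6 - util_last_bit(bits_per_vertex),
         * which is the same value because util_last_bit(2^n) == n + 1.  Worked
         * example: with 2 bits per vertex and vertex_count == 40 we want
         * dword_index == 39 * 2 / 32 == 2; the shift is 6 - util_last_bit(2) == 4
         * and 39 >> 4 == 2.
         */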
1733    if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) {
1734       fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1735       fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1736       abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
1737       unsigned log2_bits_per_vertex =
1738          util_last_bit(gs_compile->control_data_bits_per_vertex);
1739       abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex));
1740 
1741       if (per_slot_offset.file != BAD_FILE) {
1742          /* Set the per-slot offset to dword_index / 4, so that we'll write to
1743           * the appropriate OWord within the control data header.
1744           */
1745          abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u));
1746       }
1747 
1748       /* Set the channel masks to 1 << (dword_index % 4), so that we'll
1749        * write to the appropriate DWORD within the OWORD.
1750        */
1751       fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1752       fwa_bld.AND(channel, dword_index, brw_imm_ud(3u));
1753       channel_mask = intexp2(fwa_bld, channel);
1754       /* Then the channel masks need to be in bits 23:16. */
1755       fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u));
1756    }
1757 
1758    /* Store the control data bits in the message payload and send it. */
1759    int mlen = 2;
1760    if (channel_mask.file != BAD_FILE)
1761       mlen += 4; /* channel masks, plus 3 extra copies of the data */
1762    if (per_slot_offset.file != BAD_FILE)
1763       mlen++;
1764 
1765    fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
1766    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen);
1767    int i = 0;
1768    sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
1769    if (per_slot_offset.file != BAD_FILE)
1770       sources[i++] = per_slot_offset;
1771    if (channel_mask.file != BAD_FILE)
1772       sources[i++] = channel_mask;
1773    while (i < mlen) {
1774       sources[i++] = this->control_data_bits;
1775    }
1776 
1777    abld.LOAD_PAYLOAD(payload, sources, mlen, mlen);
1778    fs_inst *inst = abld.emit(opcode, reg_undef, payload);
1779    inst->mlen = mlen;
1780    /* We need to increment Global Offset by 256 bits to make room for
1781     * Broadwell's extra "Vertex Count" payload at the beginning of the
1782     * URB entry.  Since this is an OWord message, Global Offset is counted
1783     * in 128-bit units, so we must set it to 2.
1784     */
1785    if (gs_prog_data->static_vertex_count == -1)
1786       inst->offset = 2;
1787 }
1788 
1789 void
1790 fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
1791                                             unsigned stream_id)
1792 {
1793    /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
1794 
1795    /* Note: we are calling this *before* increasing vertex_count, so
1796     * this->vertex_count == vertex_count - 1 in the formula above.
1797     */
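        /* Worked example: when the fourth vertex (vertex_count register == 3)
         * is sent to stream 2, shift_count == 6 and mask == 2 << 6 == 0x80,
         * i.e. the two control data bits for that vertex (bits 7:6) are set
         * to binary 10.
         */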
1798 
1799    /* Stream mode uses 2 bits per vertex */
1800    assert(gs_compile->control_data_bits_per_vertex == 2);
1801 
1802    /* Must be a valid stream */
1803    assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS);
1804 
1805    /* Control data bits are initialized to 0 so we don't have to set any
1806     * bits when sending vertices to stream 0.
1807     */
1808    if (stream_id == 0)
1809       return;
1810 
1811    const fs_builder abld = bld.annotate("set stream control data bits", NULL);
1812 
1813    /* reg::sid = stream_id */
1814    fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1815    abld.MOV(sid, brw_imm_ud(stream_id));
1816 
1817    /* reg:shift_count = 2 * (vertex_count - 1) */
1818    fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1819    abld.SHL(shift_count, vertex_count, brw_imm_ud(1u));
1820 
1821    /* Note: we're relying on the fact that the GEN SHL instruction only pays
1822     * attention to the lower 5 bits of its second source argument, so on this
1823     * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
1824     * stream_id << ((2 * (vertex_count - 1)) % 32).
1825     */
1826    fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1827    abld.SHL(mask, sid, shift_count);
1828    abld.OR(this->control_data_bits, this->control_data_bits, mask);
1829 }
1830 
1831 void
1832 fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
1833                            unsigned stream_id)
1834 {
1835    assert(stage == MESA_SHADER_GEOMETRY);
1836 
1837    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1838 
1839    fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
1840    vertex_count.type = BRW_REGISTER_TYPE_UD;
1841 
1842    /* Haswell and later hardware ignores the "Render Stream Select" bits
1843     * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
1844     * and instead sends all primitives down the pipeline for rasterization.
1845     * If the SOL stage is enabled, "Render Stream Select" is honored and
1846     * primitives bound to non-zero streams are discarded after stream output.
1847     *
1848     * Since the only purpose of primitives sent to non-zero streams is to
1849     * be recorded by transform feedback, we can simply discard all geometry
1850     * bound to these streams when transform feedback is disabled.
1851     */
1852    if (stream_id > 0 && !nir->info->has_transform_feedback_varyings)
1853       return;
1854 
1855    /* If we're outputting 32 control data bits or less, then we can wait
1856     * until the shader is over to output them all.  Otherwise we need to
1857     * output them as we go.  Now is the time to do it, since we're about to
1858     * output the vertex_count'th vertex, so it's guaranteed that the
1859     * control data bits associated with the (vertex_count - 1)th vertex are
1860     * correct.
1861     */
1862    if (gs_compile->control_data_header_size_bits > 32) {
1863       const fs_builder abld =
1864          bld.annotate("emit vertex: emit control data bits");
1865 
1866       /* Only emit control data bits if we've finished accumulating a batch
1867        * of 32 bits.  This is the case when:
1868        *
1869        *     (vertex_count * bits_per_vertex) % 32 == 0
1870        *
1871        * (in other words, when the last 5 bits of vertex_count *
1872        * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
1873        * integer n (which is always the case, since bits_per_vertex is
1874        * always 1 or 2), this is equivalent to requiring that the last 5-n
1875        * bits of vertex_count are 0:
1876        *
1877        *     vertex_count & (2^(5-n) - 1) == 0
1878        *
1879        * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
1880        * equivalent to:
1881        *
1882        *     vertex_count & (32 / bits_per_vertex - 1) == 0
1883        *
1884        * TODO: If vertex_count is an immediate, we could do some of this math
1885        *       at compile time...
1886        */
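           /* For example, with bits_per_vertex == 2 the AND below tests
            * vertex_count & 15, so the accumulated bits get flushed once
            * every 16 vertices (16 vertices * 2 bits == one full 32-bit
            * batch).
            */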
1887       fs_inst *inst =
1888          abld.AND(bld.null_reg_d(), vertex_count,
1889                   brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u));
1890       inst->conditional_mod = BRW_CONDITIONAL_Z;
1891 
1892       abld.IF(BRW_PREDICATE_NORMAL);
1893       /* If vertex_count is 0, then no control data bits have been
1894        * accumulated yet, so we can skip emitting them.
1895        */
1896       abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u),
1897                BRW_CONDITIONAL_NEQ);
1898       abld.IF(BRW_PREDICATE_NORMAL);
1899       emit_gs_control_data_bits(vertex_count);
1900       abld.emit(BRW_OPCODE_ENDIF);
1901 
1902       /* Reset control_data_bits to 0 so we can start accumulating a new
1903        * batch.
1904        *
1905        * Note: in the case where vertex_count == 0, this neutralizes the
1906        * effect of any call to EndPrimitive() that the shader may have
1907        * made before outputting its first vertex.
1908        */
1909       inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u));
1910       inst->force_writemask_all = true;
1911       abld.emit(BRW_OPCODE_ENDIF);
1912    }
1913 
1914    emit_urb_writes(vertex_count);
1915 
1916    /* In stream mode we have to set control data bits for all vertices
1917     * unless we have disabled control data bits completely (which we do
1918     * for GL_POINTS outputs that don't use streams).
1919     */
1920    if (gs_compile->control_data_header_size_bits > 0 &&
1921        gs_prog_data->control_data_format ==
1922           GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
1923       set_gs_stream_control_data_bits(vertex_count, stream_id);
1924    }
1925 }
1926 
1927 void
1928 fs_visitor::emit_gs_input_load(const fs_reg &dst,
1929                                const nir_src &vertex_src,
1930                                unsigned base_offset,
1931                                const nir_src &offset_src,
1932                                unsigned num_components,
1933                                unsigned first_component)
1934 {
1935    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1936 
1937    nir_const_value *vertex_const = nir_src_as_const_value(vertex_src);
1938    nir_const_value *offset_const = nir_src_as_const_value(offset_src);
1939    const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
1940 
1941    /* Offset 0 is the VUE header, which contains VARYING_SLOT_LAYER [.y],
1942     * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w].  Only
1943     * gl_PointSize is available as a GS input, however, so it must be that.
1944     */
1945    const bool is_point_size = (base_offset == 0);
1946 
1947    /* TODO: figure out push input layout for invocations == 1 */
1948    if (gs_prog_data->invocations == 1 &&
1949        offset_const != NULL && vertex_const != NULL &&
1950        4 * (base_offset + offset_const->u32[0]) < push_reg_count) {
1951       int imm_offset = (base_offset + offset_const->u32[0]) * 4 +
1952                        vertex_const->u32[0] * push_reg_count;
1953       /* This input was pushed into registers. */
1954       if (is_point_size) {
1955          /* gl_PointSize comes in .w */
1956          bld.MOV(dst, fs_reg(ATTR, imm_offset + 3, dst.type));
1957       } else {
1958          for (unsigned i = 0; i < num_components; i++) {
1959             bld.MOV(offset(dst, bld, i),
1960                     fs_reg(ATTR, imm_offset + i + first_component, dst.type));
1961          }
1962       }
1963       return;
1964    }
1965 
1966    /* Resort to the pull model.  Ensure the VUE handles are provided. */
1967    gs_prog_data->base.include_vue_handles = true;
1968 
1969    unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2;
1970    fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1971 
1972    if (gs_prog_data->invocations == 1) {
1973       if (vertex_const) {
1974          /* The vertex index is constant; just select the proper URB handle. */
1975          icp_handle =
1976             retype(brw_vec8_grf(first_icp_handle + vertex_const->i32[0], 0),
1977                    BRW_REGISTER_TYPE_UD);
1978       } else {
1979          /* The vertex index is non-constant.  We need to use indirect
1980           * addressing to fetch the proper URB handle.
1981           *
1982           * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
1983           * indicating that channel <n> should read the handle from
1984           * DWord <n>.  We convert that to bytes by multiplying by 4.
1985           *
1986           * Next, we convert the vertex index to bytes by multiplying
1987           * by 32 (shifting by 5), and add the two together.  This is
1988           * the final indirect byte offset.
1989           */
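              /* Worked example: for a vertex index of 2, vertex_offset_bytes
               * == 64 and channel_offsets == <28, 24, ..., 4, 0>, so channel
               * n ends up reading DWord n of GRF (first_icp_handle + 2),
               * which holds the URB handles for vertex 2.
               */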
1990          fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_W, 1);
1991          fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1992          fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1993          fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1994 
1995          /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
1996          bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
1997          /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
1998          bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
1999          /* Convert vertex_index to bytes (multiply by 32) */
2000          bld.SHL(vertex_offset_bytes,
2001                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2002                  brw_imm_ud(5u));
2003          bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2004 
2005          /* Use first_icp_handle as the base offset.  There is one register
2006           * of URB handles per vertex, so inform the register allocator that
2007           * we might read up to nir->info->gs.vertices_in registers.
2008           */
2009          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2010                   retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
2011                   fs_reg(icp_offset_bytes),
2012                   brw_imm_ud(nir->info->gs.vertices_in * REG_SIZE));
2013       }
2014    } else {
2015       assert(gs_prog_data->invocations > 1);
2016 
2017       if (vertex_const) {
2018          assert(devinfo->gen >= 9 || vertex_const->i32[0] <= 5);
2019          bld.MOV(icp_handle,
2020                  retype(brw_vec1_grf(first_icp_handle +
2021                                      vertex_const->i32[0] / 8,
2022                                      vertex_const->i32[0] % 8),
2023                         BRW_REGISTER_TYPE_UD));
2024       } else {
2025          /* The vertex index is non-constant.  We need to use indirect
2026           * addressing to fetch the proper URB handle.
2027           *
2028           */
2029          fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2030 
2031          /* Convert vertex_index to bytes (multiply by 4) */
2032          bld.SHL(icp_offset_bytes,
2033                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2034                  brw_imm_ud(2u));
2035 
2036          /* Use first_icp_handle as the base offset.  There is one DWord
2037           * of URB handles per vertex, so inform the register allocator that
2038           * we might read up to ceil(nir->info->gs.vertices_in / 8) registers.
2039           */
2040          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2041                   retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
2042                   fs_reg(icp_offset_bytes),
2043                   brw_imm_ud(DIV_ROUND_UP(nir->info->gs.vertices_in, 8) *
2044                              REG_SIZE));
2045       }
2046    }
2047 
2048    fs_inst *inst;
2049 
2050    fs_reg tmp_dst = dst;
2051    fs_reg indirect_offset = get_nir_src(offset_src);
2052    unsigned num_iterations = 1;
2053    unsigned orig_num_components = num_components;
2054 
2055    if (type_sz(dst.type) == 8) {
2056       if (num_components > 2) {
2057          num_iterations = 2;
2058          num_components = 2;
2059       }
2060       fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
2061       tmp_dst = tmp;
2062       first_component = first_component / 2;
2063    }
2064 
2065    for (unsigned iter = 0; iter < num_iterations; iter++) {
2066       if (offset_const) {
2067          /* Constant indexing - use global offset. */
2068          if (first_component != 0) {
2069             unsigned read_components = num_components + first_component;
2070             fs_reg tmp = bld.vgrf(dst.type, read_components);
2071             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
2072             inst->size_written = read_components *
2073                                  tmp.component_size(inst->exec_size);
2074             for (unsigned i = 0; i < num_components; i++) {
2075                bld.MOV(offset(tmp_dst, bld, i),
2076                        offset(tmp, bld, i + first_component));
2077             }
2078          } else {
2079             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp_dst,
2080                             icp_handle);
2081             inst->size_written = num_components *
2082                                  tmp_dst.component_size(inst->exec_size);
2083          }
2084          inst->offset = base_offset + offset_const->u32[0];
2085          inst->mlen = 1;
2086       } else {
2087          /* Indirect indexing - use per-slot offsets as well. */
2088          const fs_reg srcs[] = { icp_handle, indirect_offset };
2089          unsigned read_components = num_components + first_component;
2090          fs_reg tmp = bld.vgrf(dst.type, read_components);
2091          fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2092          bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2093          if (first_component != 0) {
2094             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2095                             payload);
2096             inst->size_written = read_components *
2097                                  tmp.component_size(inst->exec_size);
2098             for (unsigned i = 0; i < num_components; i++) {
2099                bld.MOV(offset(tmp_dst, bld, i),
2100                        offset(tmp, bld, i + first_component));
2101             }
2102          } else {
2103             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp_dst,
2104                             payload);
2105             inst->size_written = num_components *
2106                                  tmp_dst.component_size(inst->exec_size);
2107          }
2108          inst->offset = base_offset;
2109          inst->mlen = 2;
2110       }
2111 
2112       if (type_sz(dst.type) == 8) {
2113          shuffle_32bit_load_result_to_64bit_data(
2114             bld, tmp_dst, retype(tmp_dst, BRW_REGISTER_TYPE_F), num_components);
2115 
2116          for (unsigned c = 0; c < num_components; c++)
2117             bld.MOV(offset(dst, bld, iter * 2 + c), offset(tmp_dst, bld, c));
2118       }
2119 
2120       if (num_iterations > 1) {
2121          num_components = orig_num_components - 2;
2122          if (offset_const) {
2123             base_offset++;
2124          } else {
2125             fs_reg new_indirect = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2126             bld.ADD(new_indirect, indirect_offset, brw_imm_ud(1u));
2127             indirect_offset = new_indirect;
2128          }
2129       }
2130    }
2131 
2132    if (is_point_size) {
2133       /* Read the whole VUE header (because of alignment) and read .w. */
2134       fs_reg tmp = bld.vgrf(dst.type, 4);
2135       inst->dst = tmp;
2136       inst->size_written = 4 * REG_SIZE;
2137       bld.MOV(dst, offset(tmp, bld, 3));
2138    }
2139 }
2140 
2141 fs_reg
2142 fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
2143 {
2144    nir_src *offset_src = nir_get_io_offset_src(instr);
2145    nir_const_value *const_value = nir_src_as_const_value(*offset_src);
2146 
2147    if (const_value) {
2148       /* The only constant offset we should find is 0.  brw_nir.c's
2149        * add_const_offset_to_base() will fold other constant offsets
2150        * into instr->const_index[0].
2151        */
2152       assert(const_value->u32[0] == 0);
2153       return fs_reg();
2154    }
2155 
2156    return get_nir_src(*offset_src);
2157 }
2158 
2159 static void
2160 do_untyped_vector_read(const fs_builder &bld,
2161                        const fs_reg dest,
2162                        const fs_reg surf_index,
2163                        const fs_reg offset_reg,
2164                        unsigned num_components)
2165 {
2166    if (type_sz(dest.type) == 4) {
2167       fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
2168                                              1 /* dims */,
2169                                              num_components,
2170                                              BRW_PREDICATE_NONE);
2171       read_result.type = dest.type;
2172       for (unsigned i = 0; i < num_components; i++)
2173          bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
2174    } else if (type_sz(dest.type) == 8) {
2175       /* Reading a dvec, so we need to:
2176        *
2177        * 1. Multiply num_components by 2, to account for the fact that we
2178        *    need to read 64-bit components.
2179        * 2. Shuffle the result of the load to form valid 64-bit elements
2180        * 3. Emit a second load (for components z/w) if needed.
2181        */
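           /* For example, reading a dvec3 takes two messages: the first loads
            * four 32-bit components (the x/y doubles), then the offset
            * advances by 16 bytes and the second loads two more 32-bit
            * components (the z double).
            */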
2182       fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
2183       bld.MOV(read_offset, offset_reg);
2184 
2185       int iters = num_components <= 2 ? 1 : 2;
2186 
2187       /* Load the dvec, the first iteration loads components x/y, the second
2188        * iteration, if needed, loads components z/w
2189        */
2190       for (int it = 0; it < iters; it++) {
2191          /* Compute number of components to read in this iteration */
2192          int iter_components = MIN2(2, num_components);
2193          num_components -= iter_components;
2194 
2195          /* Read. Since this message reads 32-bit components, we need to
2196           * read twice as many components.
2197           */
2198          fs_reg read_result = emit_untyped_read(bld, surf_index, read_offset,
2199                                                 1 /* dims */,
2200                                                 iter_components * 2,
2201                                                 BRW_PREDICATE_NONE);
2202 
2203          /* Shuffle the 32-bit load result into valid 64-bit data */
2204          const fs_reg packed_result = bld.vgrf(dest.type, iter_components);
2205          shuffle_32bit_load_result_to_64bit_data(
2206             bld, packed_result, read_result, iter_components);
2207 
2208          /* Move each component to its destination */
2209          read_result = retype(read_result, BRW_REGISTER_TYPE_DF);
2210          for (int c = 0; c < iter_components; c++) {
2211             bld.MOV(offset(dest, bld, it * 2 + c),
2212                     offset(packed_result, bld, c));
2213          }
2214 
2215          bld.ADD(read_offset, read_offset, brw_imm_ud(16));
2216       }
2217    } else {
2218       unreachable("Unsupported type");
2219    }
2220 }
2221 
2222 void
2223 fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
2224                                   nir_intrinsic_instr *instr)
2225 {
2226    assert(stage == MESA_SHADER_VERTEX);
2227 
2228    fs_reg dest;
2229    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2230       dest = get_nir_dest(instr->dest);
2231 
2232    switch (instr->intrinsic) {
2233    case nir_intrinsic_load_vertex_id:
2234       unreachable("should be lowered by lower_vertex_id()");
2235 
2236    case nir_intrinsic_load_vertex_id_zero_base:
2237    case nir_intrinsic_load_base_vertex:
2238    case nir_intrinsic_load_instance_id:
2239    case nir_intrinsic_load_base_instance:
2240    case nir_intrinsic_load_draw_id: {
2241       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
2242       fs_reg val = nir_system_values[sv];
2243       assert(val.file != BAD_FILE);
2244       dest.type = val.type;
2245       bld.MOV(dest, val);
2246       break;
2247    }
2248 
2249    case nir_intrinsic_load_input: {
2250       fs_reg src = fs_reg(ATTR, instr->const_index[0], dest.type);
2251       unsigned first_component = nir_intrinsic_component(instr);
2252       unsigned num_components = instr->num_components;
2253       enum brw_reg_type type = dest.type;
2254 
2255       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
2256       assert(const_offset && "Indirect input loads not allowed");
2257       src = offset(src, bld, const_offset->u32[0]);
2258 
2259       for (unsigned j = 0; j < num_components; j++) {
2260          bld.MOV(offset(dest, bld, j), offset(src, bld, j + first_component));
2261       }
2262 
2263       if (type == BRW_REGISTER_TYPE_DF) {
2264          /* Once the double vector is read, restore its original register
2265           * type so that normal execution can continue.
2266           */
2267          src = retype(src, type);
2268          dest = retype(dest, type);
2269       }
2270 
2271       if (type_sz(src.type) == 8) {
2272          shuffle_32bit_load_result_to_64bit_data(bld,
2273                                                  dest,
2274                                                  retype(dest, BRW_REGISTER_TYPE_F),
2275                                                  instr->num_components);
2276       }
2277       break;
2278    }
2279 
2280    default:
2281       nir_emit_intrinsic(bld, instr);
2282       break;
2283    }
2284 }
2285 
2286 void
2287 fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
2288                                    nir_intrinsic_instr *instr)
2289 {
2290    assert(stage == MESA_SHADER_TESS_CTRL);
2291    struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
2292    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
2293 
2294    fs_reg dst;
2295    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2296       dst = get_nir_dest(instr->dest);
2297 
2298    switch (instr->intrinsic) {
2299    case nir_intrinsic_load_primitive_id:
2300       bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1)));
2301       break;
2302    case nir_intrinsic_load_invocation_id:
2303       bld.MOV(retype(dst, invocation_id.type), invocation_id);
2304       break;
2305    case nir_intrinsic_load_patch_vertices_in:
2306       bld.MOV(retype(dst, BRW_REGISTER_TYPE_D),
2307               brw_imm_d(tcs_key->input_vertices));
2308       break;
2309 
2310    case nir_intrinsic_barrier: {
2311       if (tcs_prog_data->instances == 1)
2312          break;
2313 
2314       fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2315       fs_reg m0_2 = component(m0, 2);
2316 
2317       const fs_builder chanbld = bld.exec_all().group(1, 0);
2318 
2319       /* Zero the message header */
2320       bld.exec_all().MOV(m0, brw_imm_ud(0u));
2321 
2322       /* Copy "Barrier ID" from r0.2, bits 16:13 */
2323       chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
2324                   brw_imm_ud(INTEL_MASK(16, 13)));
2325 
2326       /* Shift it up to bits 27:24. */
2327       chanbld.SHL(m0_2, m0_2, brw_imm_ud(11));
2328 
2329       /* Set the Barrier Count and the enable bit */
2330       chanbld.OR(m0_2, m0_2,
2331                  brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
2332 
2333       bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
2334       break;
2335    }
2336 
2337    case nir_intrinsic_load_input:
2338       unreachable("nir_lower_io should never give us these.");
2339       break;
2340 
2341    case nir_intrinsic_load_per_vertex_input: {
2342       fs_reg indirect_offset = get_indirect_offset(instr);
2343       unsigned imm_offset = instr->const_index[0];
2344 
2345       const nir_src &vertex_src = instr->src[0];
2346       nir_const_value *vertex_const = nir_src_as_const_value(vertex_src);
2347 
2348       fs_inst *inst;
2349 
2350       fs_reg icp_handle;
2351 
2352       if (vertex_const) {
2353          /* Emit a MOV to resolve <0,1,0> regioning. */
2354          icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2355          bld.MOV(icp_handle,
2356                  retype(brw_vec1_grf(1 + (vertex_const->i32[0] >> 3),
2357                                      vertex_const->i32[0] & 7),
2358                         BRW_REGISTER_TYPE_UD));
2359       } else if (tcs_prog_data->instances == 1 &&
2360                  vertex_src.is_ssa &&
2361                  vertex_src.ssa->parent_instr->type == nir_instr_type_intrinsic &&
2362                  nir_instr_as_intrinsic(vertex_src.ssa->parent_instr)->intrinsic == nir_intrinsic_load_invocation_id) {
2363          /* For the common case of only 1 instance, an array index of
2364           * gl_InvocationID means reading g1.  Skip all the indirect work.
2365           */
2366          icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
2367       } else {
2368          /* The vertex index is non-constant.  We need to use indirect
2369           * addressing to fetch the proper URB handle.
2370           */
2371          icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2372 
2373          /* Each ICP handle is a single DWord (4 bytes) */
2374          fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2375          bld.SHL(vertex_offset_bytes,
2376                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2377                  brw_imm_ud(2u));
2378 
2379          /* Start at g1.  We might read up to 4 registers. */
2380          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2381                   retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes,
2382                   brw_imm_ud(4 * REG_SIZE));
2383       }
2384 
2385       /* We can only read two double components with each URB read, so
2386        * we send two read messages in that case, each one loading up to
2387        * two double components.
2388        */
2389       unsigned num_iterations = 1;
2390       unsigned num_components = instr->num_components;
2391       unsigned first_component = nir_intrinsic_component(instr);
2392       fs_reg orig_dst = dst;
2393       if (type_sz(dst.type) == 8) {
2394          first_component = first_component / 2;
2395          if (instr->num_components > 2) {
2396             num_iterations = 2;
2397             num_components = 2;
2398          }
2399 
2400          fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
2401          dst = tmp;
2402       }
2403 
2404       for (unsigned iter = 0; iter < num_iterations; iter++) {
2405          if (indirect_offset.file == BAD_FILE) {
2406             /* Constant indexing - use global offset. */
2407             if (first_component != 0) {
2408                unsigned read_components = num_components + first_component;
2409                fs_reg tmp = bld.vgrf(dst.type, read_components);
2410                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
2411                for (unsigned i = 0; i < num_components; i++) {
2412                   bld.MOV(offset(dst, bld, i),
2413                           offset(tmp, bld, i + first_component));
2414                }
2415             } else {
2416                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
2417             }
2418             inst->offset = imm_offset;
2419             inst->mlen = 1;
2420          } else {
2421             /* Indirect indexing - use per-slot offsets as well. */
2422             const fs_reg srcs[] = { icp_handle, indirect_offset };
2423             fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2424             bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2425             if (first_component != 0) {
2426                unsigned read_components = num_components + first_component;
2427                fs_reg tmp = bld.vgrf(dst.type, read_components);
2428                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2429                                payload);
2430                for (unsigned i = 0; i < num_components; i++) {
2431                   bld.MOV(offset(dst, bld, i),
2432                           offset(tmp, bld, i + first_component));
2433                }
2434             } else {
2435                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
2436                                payload);
2437             }
2438             inst->offset = imm_offset;
2439             inst->mlen = 2;
2440          }
2441          inst->size_written = (num_components + first_component) *
2442                               inst->dst.component_size(inst->exec_size);
2443 
2444          /* If we are reading 64-bit data using 32-bit read messages we need
2445           * to build proper 64-bit data elements by shuffling the low and high
2446           * 32-bit components around like we do for other things like UBOs
2447           * or SSBOs.
2448           */
2449          if (type_sz(dst.type) == 8) {
2450             shuffle_32bit_load_result_to_64bit_data(
2451                bld, dst, retype(dst, BRW_REGISTER_TYPE_F), num_components);
2452 
2453             for (unsigned c = 0; c < num_components; c++) {
2454                bld.MOV(offset(orig_dst, bld, iter * 2 + c),
2455                        offset(dst, bld, c));
2456             }
2457          }
2458 
2459          /* Copy the temporary to the destination to deal with writemasking.
2460           *
2461           * Also attempt to deal with gl_PointSize being in the .w component.
2462           */
2463          if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
2464             assert(type_sz(dst.type) < 8);
2465             inst->dst = bld.vgrf(dst.type, 4);
2466             inst->size_written = 4 * REG_SIZE;
2467             bld.MOV(dst, offset(inst->dst, bld, 3));
2468          }
2469 
2470          /* If we are loading double data and need a second read message,
2471           * adjust the write offset.
2472           */
2473          if (num_iterations > 1) {
2474             num_components = instr->num_components - 2;
2475             imm_offset++;
2476          }
2477       }
2478       break;
2479    }
2480 
2481    case nir_intrinsic_load_output:
2482    case nir_intrinsic_load_per_vertex_output: {
2483       fs_reg indirect_offset = get_indirect_offset(instr);
2484       unsigned imm_offset = instr->const_index[0];
2485       unsigned first_component = nir_intrinsic_component(instr);
2486 
2487       fs_inst *inst;
2488       if (indirect_offset.file == BAD_FILE) {
2489          /* Replicate the patch handle to all enabled channels */
2490          fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2491          bld.MOV(patch_handle,
2492                  retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
2493 
2494          {
2495             if (first_component != 0) {
2496                unsigned read_components =
2497                   instr->num_components + first_component;
2498                fs_reg tmp = bld.vgrf(dst.type, read_components);
2499                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
2500                                patch_handle);
2501                inst->size_written = read_components * REG_SIZE;
2502                for (unsigned i = 0; i < instr->num_components; i++) {
2503                   bld.MOV(offset(dst, bld, i),
2504                           offset(tmp, bld, i + first_component));
2505                }
2506             } else {
2507                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
2508                                patch_handle);
2509                inst->size_written = instr->num_components * REG_SIZE;
2510             }
2511             inst->offset = imm_offset;
2512             inst->mlen = 1;
2513          }
2514       } else {
2515          /* Indirect indexing - use per-slot offsets as well. */
2516          const fs_reg srcs[] = {
2517             retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2518             indirect_offset
2519          };
2520          fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2521          bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2522          if (first_component != 0) {
2523             unsigned read_components =
2524                instr->num_components + first_component;
2525             fs_reg tmp = bld.vgrf(dst.type, read_components);
2526             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2527                             payload);
2528             inst->size_written = read_components * REG_SIZE;
2529             for (unsigned i = 0; i < instr->num_components; i++) {
2530                bld.MOV(offset(dst, bld, i),
2531                        offset(tmp, bld, i + first_component));
2532             }
2533          } else {
2534             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
2535                             payload);
2536             inst->size_written = instr->num_components * REG_SIZE;
2537          }
2538          inst->offset = imm_offset;
2539          inst->mlen = 2;
2540       }
2541       break;
2542    }
2543 
2544    case nir_intrinsic_store_output:
2545    case nir_intrinsic_store_per_vertex_output: {
2546       fs_reg value = get_nir_src(instr->src[0]);
2547       bool is_64bit = (instr->src[0].is_ssa ?
2548          instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64;
2549       fs_reg indirect_offset = get_indirect_offset(instr);
2550       unsigned imm_offset = instr->const_index[0];
2551       unsigned swiz = BRW_SWIZZLE_XYZW;
2552       unsigned mask = instr->const_index[1];
2553       unsigned header_regs = 0;
2554       fs_reg srcs[7];
2555       srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
2556 
2557       if (indirect_offset.file != BAD_FILE) {
2558          srcs[header_regs++] = indirect_offset;
2559       }
2560 
2561       if (mask == 0)
2562          break;
2563 
2564       unsigned num_components = util_last_bit(mask);
2565       enum opcode opcode;
2566 
2567       /* We can only pack two 64-bit components in a single message, so send
2568        * 2 messages if we have more components
2569        */
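           /* For example, storing a dvec3 takes two messages: the first
            * writes the four 32-bit channels holding X and Y, then mask >>= 2
            * and imm_offset is incremented, and the second writes the two
            * channels of Z using a 32-bit channel mask of XY.
            */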
2570       unsigned num_iterations = 1;
2571       unsigned iter_components = num_components;
2572       unsigned first_component = nir_intrinsic_component(instr);
2573       if (is_64bit) {
2574          first_component = first_component / 2;
2575          if (instr->num_components > 2) {
2576             num_iterations = 2;
2577             iter_components = 2;
2578          }
2579       }
2580 
2581       /* 64-bit data needs to be shuffled before we can write it to the URB.
2582        * We will use this temporary to shuffle the components in each
2583        * iteration.
2584        */
2585       fs_reg tmp =
2586          fs_reg(VGRF, alloc.allocate(2 * iter_components), value.type);
2587 
2588       mask = mask << first_component;
2589 
2590       for (unsigned iter = 0; iter < num_iterations; iter++) {
2591          if (!is_64bit && mask != WRITEMASK_XYZW) {
2592             srcs[header_regs++] = brw_imm_ud(mask << 16);
2593             opcode = indirect_offset.file != BAD_FILE ?
2594                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
2595                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
2596          } else if (is_64bit && ((mask & WRITEMASK_XY) != WRITEMASK_XY)) {
2597             /* Expand the 64-bit mask to 32-bit channels. We only handle
2598              * two channels in each iteration, so we only care about X/Y.
2599              */
2600             unsigned mask32 = 0;
2601             if (mask & WRITEMASK_X)
2602                mask32 |= WRITEMASK_XY;
2603             if (mask & WRITEMASK_Y)
2604                mask32 |= WRITEMASK_ZW;
2605 
2606             /* If the mask does not include any of the channels X or Y there
2607              * is nothing to do in this iteration. Move on to the next couple
2608              * of 64-bit channels.
2609              */
2610             if (!mask32) {
2611                mask >>= 2;
2612                imm_offset++;
2613                continue;
2614             }
2615 
2616             srcs[header_regs++] = brw_imm_ud(mask32 << 16);
2617             opcode = indirect_offset.file != BAD_FILE ?
2618                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
2619                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
2620          } else {
2621             opcode = indirect_offset.file != BAD_FILE ?
2622                SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT :
2623                SHADER_OPCODE_URB_WRITE_SIMD8;
2624          }
2625 
2626          for (unsigned i = 0; i < iter_components; i++) {
2627             if (!(mask & (1 << (i + first_component))))
2628                continue;
2629 
2630             if (!is_64bit) {
2631                srcs[header_regs + i + first_component] =
2632                   offset(value, bld, BRW_GET_SWZ(swiz, i));
2633             } else {
2634                /* We need to shuffle the 64-bit data to match the layout
2635                 * expected by our 32-bit URB write messages. We use a temporary
2636                 * for that.
2637                 */
2638                unsigned channel = BRW_GET_SWZ(swiz, iter * 2 + i);
2639                shuffle_64bit_data_for_32bit_write(bld,
2640                   retype(offset(tmp, bld, 2 * i), BRW_REGISTER_TYPE_F),
2641                   retype(offset(value, bld, 2 * channel), BRW_REGISTER_TYPE_DF),
2642                   1);
2643 
2644                /* Now copy the data to the destination */
2645                fs_reg dest = fs_reg(VGRF, alloc.allocate(2), value.type);
2646                unsigned idx = 2 * i;
2647                bld.MOV(dest, offset(tmp, bld, idx));
2648                bld.MOV(offset(dest, bld, 1), offset(tmp, bld, idx + 1));
2649                srcs[header_regs + idx + first_component * 2] = dest;
2650                srcs[header_regs + idx + 1 + first_component * 2] =
2651                   offset(dest, bld, 1);
2652             }
2653          }
2654 
2655          unsigned mlen =
2656             header_regs + (is_64bit ? 2 * iter_components : iter_components) +
2657             (is_64bit ? 2 * first_component : first_component);
2658          fs_reg payload =
2659             bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
2660          bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs);
2661 
2662          fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload);
2663          inst->offset = imm_offset;
2664          inst->mlen = mlen;
2665 
2666          /* If this is a 64-bit attribute, select the next two 64-bit channels
2667           * to be handled in the next iteration.
2668           */
2669          if (is_64bit) {
2670             mask >>= 2;
2671             imm_offset++;
2672          }
2673       }
2674       break;
2675    }
2676 
2677    default:
2678       nir_emit_intrinsic(bld, instr);
2679       break;
2680    }
2681 }
2682 
2683 void
2684 fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
2685                                    nir_intrinsic_instr *instr)
2686 {
2687    assert(stage == MESA_SHADER_TESS_EVAL);
2688    struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data);
2689 
2690    fs_reg dest;
2691    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2692       dest = get_nir_dest(instr->dest);
2693 
2694    switch (instr->intrinsic) {
2695    case nir_intrinsic_load_primitive_id:
2696       bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1)));
2697       break;
2698    case nir_intrinsic_load_tess_coord:
2699       /* gl_TessCoord is part of the payload in g1-3 */
2700       for (unsigned i = 0; i < 3; i++) {
2701          bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0)));
2702       }
2703       break;
2704 
2705    case nir_intrinsic_load_input:
2706    case nir_intrinsic_load_per_vertex_input: {
2707       fs_reg indirect_offset = get_indirect_offset(instr);
2708       unsigned imm_offset = instr->const_index[0];
2709       unsigned first_component = nir_intrinsic_component(instr);
2710 
2711       if (type_sz(dest.type) == 8) {
2712          first_component = first_component / 2;
2713       }
2714 
2715       fs_inst *inst;
2716       if (indirect_offset.file == BAD_FILE) {
2717          /* Arbitrarily only push up to 32 vec4 slots worth of data,
2718           * which is 16 registers (since each holds 2 vec4 slots).
2719           */
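              /* Each push register here covers two vec4 slots, so the
               * component index below starts at
               * 16 / type_sz(dest.type) * (imm_offset % 2): e.g. components
               * 4..7 of the register for a 32-bit read of an odd slot, or
               * components 2..3 for a 64-bit read.
               */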
2720          const unsigned max_push_slots = 32;
2721          if (imm_offset < max_push_slots) {
2722             fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type);
2723             for (int i = 0; i < instr->num_components; i++) {
2724                unsigned comp = 16 / type_sz(dest.type) * (imm_offset % 2) +
2725                   i + first_component;
2726                bld.MOV(offset(dest, bld, i), component(src, comp));
2727             }
2728             tes_prog_data->base.urb_read_length =
2729                MAX2(tes_prog_data->base.urb_read_length,
2730                     DIV_ROUND_UP(imm_offset + 1, 2));
2731          } else {
2732             /* Replicate the patch handle to all enabled channels */
2733             const fs_reg srcs[] = {
2734                retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)
2735             };
2736             fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2737             bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0);
2738 
2739             if (first_component != 0) {
2740                unsigned read_components =
2741                   instr->num_components + first_component;
2742                fs_reg tmp = bld.vgrf(dest.type, read_components);
2743                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
2744                                patch_handle);
2745                inst->size_written = read_components * REG_SIZE;
2746                for (unsigned i = 0; i < instr->num_components; i++) {
2747                   bld.MOV(offset(dest, bld, i),
2748                           offset(tmp, bld, i + first_component));
2749                }
2750             } else {
2751                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest,
2752                                patch_handle);
2753                inst->size_written = instr->num_components * REG_SIZE;
2754             }
2755             inst->mlen = 1;
2756             inst->offset = imm_offset;
2757          }
2758       } else {
2759          /* Indirect indexing - use per-slot offsets as well. */
2760 
2761          /* We can only read two double components with each URB read, so
2762           * we send two read messages in that case, each one loading up to
2763           * two double components.
2764           */
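          /* For example, a dvec3 or dvec4 load takes two iterations: the
           * first reads two double components and the second reads the rest.
           */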
2765          unsigned num_iterations = 1;
2766          unsigned num_components = instr->num_components;
2767          fs_reg orig_dest = dest;
2768          if (type_sz(dest.type) == 8) {
2769             if (instr->num_components > 2) {
2770                num_iterations = 2;
2771                num_components = 2;
2772             }
2773             fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dest.type);
2774             dest = tmp;
2775          }
2776 
2777          for (unsigned iter = 0; iter < num_iterations; iter++) {
2778             const fs_reg srcs[] = {
2779                retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2780                indirect_offset
2781             };
2782             fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2783             bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2784 
2785             if (first_component != 0) {
2786                unsigned read_components =
2787                    num_components + first_component;
2788                fs_reg tmp = bld.vgrf(dest.type, read_components);
2789                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2790                                payload);
2791                for (unsigned i = 0; i < num_components; i++) {
2792                   bld.MOV(offset(dest, bld, i),
2793                           offset(tmp, bld, i + first_component));
2794                }
2795             } else {
2796                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest,
2797                                payload);
2798             }
2799             inst->mlen = 2;
2800             inst->offset = imm_offset;
2801             inst->size_written = (num_components + first_component) *
2802                                  inst->dst.component_size(inst->exec_size);
2803 
2804             /* If we are reading 64-bit data using 32-bit read messages we need
2805              * to build proper 64-bit data elements by shuffling the low and high
2806              * 32-bit components around like we do for other things like UBOs
2807              * or SSBOs.
2808              */
2809             if (type_sz(dest.type) == 8) {
2810                shuffle_32bit_load_result_to_64bit_data(
2811                   bld, dest, retype(dest, BRW_REGISTER_TYPE_F), num_components);
2812 
2813                for (unsigned c = 0; c < num_components; c++) {
2814                   bld.MOV(offset(orig_dest, bld, iter * 2 + c),
2815                           offset(dest, bld, c));
2816                }
2817             }
2818 
2819             /* If we are loading double data and need a second read message,
2820              * adjust the offset and the remaining component count.
2821              */
2822             if (num_iterations > 1) {
2823                num_components = instr->num_components - 2;
2824                imm_offset++;
2825             }
2826          }
2827       }
2828       break;
2829    }
2830    default:
2831       nir_emit_intrinsic(bld, instr);
2832       break;
2833    }
2834 }
2835 
2836 void
2837 fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
2838                                   nir_intrinsic_instr *instr)
2839 {
2840    assert(stage == MESA_SHADER_GEOMETRY);
2841    fs_reg indirect_offset;
2842 
2843    fs_reg dest;
2844    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2845       dest = get_nir_dest(instr->dest);
2846 
2847    switch (instr->intrinsic) {
2848    case nir_intrinsic_load_primitive_id:
2849       assert(stage == MESA_SHADER_GEOMETRY);
2850       assert(brw_gs_prog_data(prog_data)->include_primitive_id);
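      /* When include_primitive_id is set, the primitive ID is delivered in
       * g2 of the GS thread payload.
       */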
2851       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
2852               retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
2853       break;
2854 
2855    case nir_intrinsic_load_input:
2856       unreachable("load_input intrinsics are invalid for the GS stage");
2857 
2858    case nir_intrinsic_load_per_vertex_input:
2859       emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
2860                          instr->src[1], instr->num_components,
2861                          nir_intrinsic_component(instr));
2862       break;
2863 
2864    case nir_intrinsic_emit_vertex_with_counter:
2865       emit_gs_vertex(instr->src[0], instr->const_index[0]);
2866       break;
2867 
2868    case nir_intrinsic_end_primitive_with_counter:
2869       emit_gs_end_primitive(instr->src[0]);
2870       break;
2871 
2872    case nir_intrinsic_set_vertex_count:
2873       bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));
2874       break;
2875 
2876    case nir_intrinsic_load_invocation_id: {
2877       fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
2878       assert(val.file != BAD_FILE);
2879       dest.type = val.type;
2880       bld.MOV(dest, val);
2881       break;
2882    }
2883 
2884    default:
2885       nir_emit_intrinsic(bld, instr);
2886       break;
2887    }
2888 }
2889 
2890 /**
2891  * Fetch the current render target layer index.
2892  */
2893 static fs_reg
2894 fetch_render_target_array_index(const fs_builder &bld)
2895 {
2896    if (bld.shader->devinfo->gen >= 6) {
2897       /* The render target array index is provided in the thread payload as
2898        * bits 26:16 of r0.0.
2899        */
2900       const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);
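      /* brw_uw1_reg(..., 0, 1) selects the second word of r0 (bits 31:16 of
       * r0.0), so ANDing with 0x7ff keeps exactly bits 26:16.
       */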
2901       bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1),
2902               brw_imm_uw(0x7ff));
2903       return idx;
2904    } else {
2905       /* Pre-SNB we only ever render into the first layer of the framebuffer
2906        * since layered rendering is not implemented.
2907        */
2908       return brw_imm_ud(0);
2909    }
2910 }
2911 
2912 /**
2913  * Fake non-coherent framebuffer read implemented using TXF to fetch from the
2914  * framebuffer at the current fragment coordinates and sample index.
2915  */
2916 fs_inst *
2917 fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst,
2918                                       unsigned target)
2919 {
2920    const struct gen_device_info *devinfo = bld.shader->devinfo;
2921 
2922    assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
2923    const brw_wm_prog_key *wm_key =
2924       reinterpret_cast<const brw_wm_prog_key *>(key);
2925    assert(!wm_key->coherent_fb_fetch);
2926    const struct brw_wm_prog_data *wm_prog_data =
2927       brw_wm_prog_data(stage_prog_data);
2928 
2929    /* Calculate the surface index relative to the start of the texture binding
2930     * table block, since that's what the texturing messages expect.
2931     */
2932    const unsigned surface = target +
2933       wm_prog_data->binding_table.render_target_read_start -
2934       wm_prog_data->base.binding_table.texture_start;
2935 
2936    brw_mark_surface_used(
2937       bld.shader->stage_prog_data,
2938       wm_prog_data->binding_table.render_target_read_start + target);
2939 
2940    /* Calculate the fragment coordinates. */
2941    const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
2942    bld.MOV(offset(coords, bld, 0), pixel_x);
2943    bld.MOV(offset(coords, bld, 1), pixel_y);
2944    bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
2945 
2946    /* Calculate the sample index and MCS payload when multisampling.  Luckily
2947     * the MCS fetch message behaves deterministically for UMS surfaces, so it
2948     * shouldn't be necessary to recompile based on whether the framebuffer is
2949     * CMS or UMS.
2950     */
2951    if (wm_key->multisample_fbo &&
2952        nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
2953       nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
2954 
2955    const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
2956    const fs_reg mcs = wm_key->multisample_fbo ?
2957       emit_mcs_fetch(coords, 3, brw_imm_ud(surface)) : fs_reg();
2958 
2959    /* Use either a normal or a CMS texel fetch message depending on whether
2960     * the framebuffer is single or multisample.  On SKL+ use the wide CMS
2961     * message just in case the framebuffer uses 16x multisampling, it should
2962     * be equivalent to the normal CMS fetch for lower multisampling modes.
2963     */
2964    const opcode op = !wm_key->multisample_fbo ? SHADER_OPCODE_TXF_LOGICAL :
2965                      devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W_LOGICAL :
2966                      SHADER_OPCODE_TXF_CMS_LOGICAL;
2967 
2968    /* Emit the instruction. */
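   /* The sources follow the TEX_LOGICAL_SRC_* layout: coordinates, shadow
    * comparator (unused), LOD, second derivative (unused), sample index,
    * MCS data, surface index, sampler index, texel offset (unused), and the
    * coordinate and gradient component counts (3 and 0 here).
    */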
2969    const fs_reg srcs[] = { coords, fs_reg(), brw_imm_ud(0), fs_reg(),
2970                            sample, mcs,
2971                            brw_imm_ud(surface), brw_imm_ud(0),
2972                            fs_reg(), brw_imm_ud(3), brw_imm_ud(0) };
2973    STATIC_ASSERT(ARRAY_SIZE(srcs) == TEX_LOGICAL_NUM_SRCS);
2974 
2975    fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
2976    inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
2977 
2978    return inst;
2979 }
2980 
2981 /**
2982  * Actual coherent framebuffer read implemented using the native render target
2983  * read message.  Requires SKL+.
2984  */
2985 static fs_inst *
2986 emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target)
2987 {
2988    assert(bld.shader->devinfo->gen >= 9);
2989    fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst);
2990    inst->target = target;
2991    inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
2992 
2993    return inst;
2994 }
2995 
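/**
 * Return the first of the n temporaries if one has already been allocated,
 * otherwise allocate a new temporary of the given size and make all n
 * entries of regs alias it.
 */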
2996 static fs_reg
2997 alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n)
2998 {
2999    if (n && regs[0].file != BAD_FILE) {
3000       return regs[0];
3001 
3002    } else {
3003       const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size);
3004 
3005       for (unsigned i = 0; i < n; i++)
3006          regs[i] = tmp;
3007 
3008       return tmp;
3009    }
3010 }
3011 
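/**
 * Return (allocating on first use) the temporary register backing the
 * fragment shader output described by a packed BRW_NIR_FRAG_OUTPUT_LOCATION /
 * BRW_NIR_FRAG_OUTPUT_INDEX pair.
 */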
3012 static fs_reg
3013 alloc_frag_output(fs_visitor *v, unsigned location)
3014 {
3015    assert(v->stage == MESA_SHADER_FRAGMENT);
3016    const brw_wm_prog_key *const key =
3017       reinterpret_cast<const brw_wm_prog_key *>(v->key);
3018    const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION);
3019    const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX);
3020 
3021    if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1))
3022       return alloc_temporary(v->bld, 4, &v->dual_src_output, 1);
3023 
3024    else if (l == FRAG_RESULT_COLOR)
3025       return alloc_temporary(v->bld, 4, v->outputs,
3026                              MAX2(key->nr_color_regions, 1));
3027 
3028    else if (l == FRAG_RESULT_DEPTH)
3029       return alloc_temporary(v->bld, 1, &v->frag_depth, 1);
3030 
3031    else if (l == FRAG_RESULT_STENCIL)
3032       return alloc_temporary(v->bld, 1, &v->frag_stencil, 1);
3033 
3034    else if (l == FRAG_RESULT_SAMPLE_MASK)
3035       return alloc_temporary(v->bld, 1, &v->sample_mask, 1);
3036 
3037    else if (l >= FRAG_RESULT_DATA0 &&
3038             l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS)
3039       return alloc_temporary(v->bld, 4,
3040                              &v->outputs[l - FRAG_RESULT_DATA0], 1);
3041 
3042    else
3043       unreachable("Invalid location");
3044 }
3045 
3046 void
3047 fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
3048                                   nir_intrinsic_instr *instr)
3049 {
3050    assert(stage == MESA_SHADER_FRAGMENT);
3051 
3052    fs_reg dest;
3053    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3054       dest = get_nir_dest(instr->dest);
3055 
3056    switch (instr->intrinsic) {
3057    case nir_intrinsic_load_front_face:
3058       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
3059               *emit_frontfacing_interpolation());
3060       break;
3061 
3062    case nir_intrinsic_load_sample_pos: {
3063       fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
3064       assert(sample_pos.file != BAD_FILE);
3065       dest.type = sample_pos.type;
3066       bld.MOV(dest, sample_pos);
3067       bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
3068       break;
3069    }
3070 
3071    case nir_intrinsic_load_layer_id:
3072       dest.type = BRW_REGISTER_TYPE_UD;
3073       bld.MOV(dest, fetch_render_target_array_index(bld));
3074       break;
3075 
3076    case nir_intrinsic_load_helper_invocation:
3077    case nir_intrinsic_load_sample_mask_in:
3078    case nir_intrinsic_load_sample_id: {
3079       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3080       fs_reg val = nir_system_values[sv];
3081       assert(val.file != BAD_FILE);
3082       dest.type = val.type;
3083       bld.MOV(dest, val);
3084       break;
3085    }
3086 
3087    case nir_intrinsic_store_output: {
3088       const fs_reg src = get_nir_src(instr->src[0]);
3089       const nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
3090       assert(const_offset && "Indirect output stores not allowed");
3091       const unsigned location = nir_intrinsic_base(instr) +
3092          SET_FIELD(const_offset->u32[0], BRW_NIR_FRAG_OUTPUT_LOCATION);
3093       const fs_reg new_dest = retype(alloc_frag_output(this, location),
3094                                      src.type);
3095 
3096       for (unsigned j = 0; j < instr->num_components; j++)
3097          bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j),
3098                  offset(src, bld, j));
3099 
3100       break;
3101    }
3102 
3103    case nir_intrinsic_load_output: {
3104       const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
3105                                    BRW_NIR_FRAG_OUTPUT_LOCATION);
3106       assert(l >= FRAG_RESULT_DATA0);
3107       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3108       assert(const_offset && "Indirect output loads not allowed");
3109       const unsigned target = l - FRAG_RESULT_DATA0 + const_offset->u32[0];
3110       const fs_reg tmp = bld.vgrf(dest.type, 4);
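      /* Both framebuffer read paths return a full vec4; the per-component
       * copy below extracts only the components the shader asked for.
       */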
3111 
3112       if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch)
3113          emit_coherent_fb_read(bld, tmp, target);
3114       else
3115          emit_non_coherent_fb_read(bld, tmp, target);
3116 
3117       for (unsigned j = 0; j < instr->num_components; j++) {
3118          bld.MOV(offset(dest, bld, j),
3119                  offset(tmp, bld, nir_intrinsic_component(instr) + j));
3120       }
3121 
3122       break;
3123    }
3124 
3125    case nir_intrinsic_discard:
3126    case nir_intrinsic_discard_if: {
3127       /* We track our discarded pixels in f0.1.  By predicating on it, we can
3128        * update just the flag bits that aren't yet discarded.  If there's no
3129        * condition, we emit a CMP of g0 != g0, so all currently executing
3130        * channels will get turned off.
3131        */
3132       fs_inst *cmp;
3133       if (instr->intrinsic == nir_intrinsic_discard_if) {
3134          cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]),
3135                        brw_imm_d(0), BRW_CONDITIONAL_Z);
3136       } else {
3137          fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
3138                                        BRW_REGISTER_TYPE_UW));
3139          cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
3140       }
3141       cmp->predicate = BRW_PREDICATE_NORMAL;
3142       cmp->flag_subreg = 1;
3143 
3144       if (devinfo->gen >= 6) {
3145          emit_discard_jump();
3146       }
3147       break;
3148    }
3149 
3150    case nir_intrinsic_load_input: {
3151       /* load_input is only used for flat inputs */
3152       unsigned base = nir_intrinsic_base(instr);
3153       unsigned component = nir_intrinsic_component(instr);
3154       unsigned num_components = instr->num_components;
3155       enum brw_reg_type type = dest.type;
3156 
3157       /* Special case fields in the VUE header */
3158       if (base == VARYING_SLOT_LAYER)
3159          component = 1;
3160       else if (base == VARYING_SLOT_VIEWPORT)
3161          component = 2;
3162 
3163       if (nir_dest_bit_size(instr->dest) == 64) {
3164          /* const_index is expressed in 32-bit component units, which may not be
3165           * aligned for DF.  Read the double vector as if it were a float vector
3166           * of twice as many components to fetch the right data.
3167           */
3168          type = BRW_REGISTER_TYPE_F;
3169          num_components *= 2;
3170       }
3171 
3172       for (unsigned int i = 0; i < num_components; i++) {
3173          struct brw_reg interp = interp_reg(base, component + i);
3174          interp = suboffset(interp, 3);
3175          bld.emit(FS_OPCODE_CINTERP, offset(retype(dest, type), bld, i),
3176                   retype(fs_reg(interp), type));
3177       }
3178 
3179       if (nir_dest_bit_size(instr->dest) == 64) {
3180          shuffle_32bit_load_result_to_64bit_data(bld,
3181                                                  dest,
3182                                                  retype(dest, type),
3183                                                  instr->num_components);
3184       }
3185       break;
3186    }
3187 
3188    case nir_intrinsic_load_barycentric_pixel:
3189    case nir_intrinsic_load_barycentric_centroid:
3190    case nir_intrinsic_load_barycentric_sample:
3191       /* Do nothing - load_interpolated_input handling will handle it later. */
3192       break;
3193 
3194    case nir_intrinsic_load_barycentric_at_sample: {
3195       const glsl_interp_mode interpolation =
3196          (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3197 
3198       nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
3199 
3200       if (const_sample) {
3201          unsigned msg_data = const_sample->i32[0] << 4;
3202 
3203          emit_pixel_interpolater_send(bld,
3204                                       FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3205                                       dest,
3206                                       fs_reg(), /* src */
3207                                       brw_imm_ud(msg_data),
3208                                       interpolation);
3209       } else {
3210          const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
3211                                           BRW_REGISTER_TYPE_UD);
3212 
3213          if (nir_src_is_dynamically_uniform(instr->src[0])) {
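            /* The sample number is the same in every live channel, so a
             * single interpolater message with a uniformized sample id is
             * enough.
             */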
3214             const fs_reg sample_id = bld.emit_uniformize(sample_src);
3215             const fs_reg msg_data = vgrf(glsl_type::uint_type);
3216             bld.exec_all().group(1, 0)
3217                .SHL(msg_data, sample_id, brw_imm_ud(4u));
3218             emit_pixel_interpolater_send(bld,
3219                                          FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3220                                          dest,
3221                                          fs_reg(), /* src */
3222                                          msg_data,
3223                                          interpolation);
3224          } else {
3225             /* Make a loop that sends a message to the pixel interpolater
3226              * for the sample number in each live channel. If there are
3227              * multiple channels with the same sample number then these
3228              * will be handled simultaneously with a single iteration of
3229              * the loop.
3230              */
3231             bld.emit(BRW_OPCODE_DO);
3232 
3233             /* Get the next live sample number into sample_id_reg */
3234             const fs_reg sample_id = bld.emit_uniformize(sample_src);
3235 
3236             /* Set the flag register so that we can perform the send
3237              * message on all channels that have the same sample number
3238              */
3239             bld.CMP(bld.null_reg_ud(),
3240                     sample_src, sample_id,
3241                     BRW_CONDITIONAL_EQ);
3242             const fs_reg msg_data = vgrf(glsl_type::uint_type);
3243             bld.exec_all().group(1, 0)
3244                .SHL(msg_data, sample_id, brw_imm_ud(4u));
3245             fs_inst *inst =
3246                emit_pixel_interpolater_send(bld,
3247                                             FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3248                                             dest,
3249                                             fs_reg(), /* src */
3250                                             msg_data,
3251                                             interpolation);
3252             set_predicate(BRW_PREDICATE_NORMAL, inst);
3253 
3254             /* Continue the loop if there are any live channels left */
3255             set_predicate_inv(BRW_PREDICATE_NORMAL,
3256                               true, /* inverse */
3257                               bld.emit(BRW_OPCODE_WHILE));
3258          }
3259       }
3260       break;
3261    }
3262 
3263    case nir_intrinsic_load_barycentric_at_offset: {
3264       const glsl_interp_mode interpolation =
3265          (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3266 
3267       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3268 
3269       if (const_offset) {
3270          unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) & 0xf;
3271          unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) & 0xf;
3272 
3273          emit_pixel_interpolater_send(bld,
3274                                       FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
3275                                       dest,
3276                                       fs_reg(), /* src */
3277                                       brw_imm_ud(off_x | (off_y << 4)),
3278                                       interpolation);
3279       } else {
3280          fs_reg src = vgrf(glsl_type::ivec2_type);
3281          fs_reg offset_src = retype(get_nir_src(instr->src[0]),
3282                                     BRW_REGISTER_TYPE_F);
3283          for (int i = 0; i < 2; i++) {
3284             fs_reg temp = vgrf(glsl_type::float_type);
3285             bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f));
3286             fs_reg itemp = vgrf(glsl_type::int_type);
3287             /* float to int */
3288             bld.MOV(itemp, temp);
3289 
3290             /* Clamp the upper end of the range to +7/16.
3291              * ARB_gpu_shader5 requires that we support a maximum offset
3292              * of +0.5, which isn't representable in a S0.4 value -- if
3293              * we didn't clamp it, we'd end up with -8/16, which is the
3294              * opposite of what the shader author wanted.
3295              *
3296              * This is legal due to ARB_gpu_shader5's quantization
3297              * rules:
3298              *
3299              * "Not all values of <offset> may be supported; x and y
3300              * offsets may be rounded to fixed-point values with the
3301              * number of fraction bits given by the
3302              * implementation-dependent constant
3303              * FRAGMENT_INTERPOLATION_OFFSET_BITS"
3304              */
3305             set_condmod(BRW_CONDITIONAL_L,
3306                         bld.SEL(offset(src, bld, i), itemp, brw_imm_d(7)));
3307          }
3308 
3309          const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
3310          emit_pixel_interpolater_send(bld,
3311                                       opcode,
3312                                       dest,
3313                                       src,
3314                                       brw_imm_ud(0u),
3315                                       interpolation);
3316       }
3317       break;
3318    }
3319 
3320    case nir_intrinsic_load_interpolated_input: {
3321       if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) {
3322          emit_fragcoord_interpolation(dest);
3323          break;
3324       }
3325 
3326       assert(instr->src[0].ssa &&
3327              instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
3328       nir_intrinsic_instr *bary_intrinsic =
3329          nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
3330       nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
3331       enum glsl_interp_mode interp_mode =
3332          (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic);
3333       fs_reg dst_xy;
3334 
3335       if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
3336           bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
3337          /* Use the result of the PI message */
3338          dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F);
3339       } else {
3340          /* Use the delta_xy values computed from the payload */
3341          enum brw_barycentric_mode bary =
3342             brw_barycentric_mode(interp_mode, bary_intrin);
3343 
3344          dst_xy = this->delta_xy[bary];
3345       }
3346 
3347       for (unsigned int i = 0; i < instr->num_components; i++) {
3348          fs_reg interp =
3349             fs_reg(interp_reg(nir_intrinsic_base(instr),
3350                               nir_intrinsic_component(instr) + i));
3351          interp.type = BRW_REGISTER_TYPE_F;
3352          dest.type = BRW_REGISTER_TYPE_F;
3353 
3354          if (devinfo->gen < 6 && interp_mode == INTERP_MODE_SMOOTH) {
3355             fs_reg tmp = vgrf(glsl_type::float_type);
3356             bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp);
3357             bld.MUL(offset(dest, bld, i), tmp, this->pixel_w);
3358          } else {
3359             bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp);
3360          }
3361       }
3362       break;
3363    }
3364 
3365    default:
3366       nir_emit_intrinsic(bld, instr);
3367       break;
3368    }
3369 }
3370 
3371 void
3372 fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
3373                                   nir_intrinsic_instr *instr)
3374 {
3375    assert(stage == MESA_SHADER_COMPUTE);
3376    struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
3377 
3378    fs_reg dest;
3379    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3380       dest = get_nir_dest(instr->dest);
3381 
3382    switch (instr->intrinsic) {
3383    case nir_intrinsic_barrier:
3384       emit_barrier();
3385       cs_prog_data->uses_barrier = true;
3386       break;
3387 
3388    case nir_intrinsic_load_local_invocation_id:
3389    case nir_intrinsic_load_work_group_id: {
3390       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3391       fs_reg val = nir_system_values[sv];
3392       assert(val.file != BAD_FILE);
3393       dest.type = val.type;
3394       for (unsigned i = 0; i < 3; i++)
3395          bld.MOV(offset(dest, bld, i), offset(val, bld, i));
3396       break;
3397    }
3398 
3399    case nir_intrinsic_load_num_work_groups: {
3400       const unsigned surface =
3401          cs_prog_data->binding_table.work_groups_start;
3402 
3403       cs_prog_data->uses_num_work_groups = true;
3404 
3405       fs_reg surf_index = brw_imm_ud(surface);
3406       brw_mark_surface_used(prog_data, surface);
3407 
3408       /* Read the 3 GLuint components of gl_NumWorkGroups */
3409       for (unsigned i = 0; i < 3; i++) {
3410          fs_reg read_result =
3411             emit_untyped_read(bld, surf_index,
3412                               brw_imm_ud(i << 2),
3413                               1 /* dims */, 1 /* size */,
3414                               BRW_PREDICATE_NONE);
3415          read_result.type = dest.type;
3416          bld.MOV(dest, read_result);
3417          dest = offset(dest, bld, 1);
3418       }
3419       break;
3420    }
3421 
3422    case nir_intrinsic_shared_atomic_add:
3423       nir_emit_shared_atomic(bld, BRW_AOP_ADD, instr);
3424       break;
3425    case nir_intrinsic_shared_atomic_imin:
3426       nir_emit_shared_atomic(bld, BRW_AOP_IMIN, instr);
3427       break;
3428    case nir_intrinsic_shared_atomic_umin:
3429       nir_emit_shared_atomic(bld, BRW_AOP_UMIN, instr);
3430       break;
3431    case nir_intrinsic_shared_atomic_imax:
3432       nir_emit_shared_atomic(bld, BRW_AOP_IMAX, instr);
3433       break;
3434    case nir_intrinsic_shared_atomic_umax:
3435       nir_emit_shared_atomic(bld, BRW_AOP_UMAX, instr);
3436       break;
3437    case nir_intrinsic_shared_atomic_and:
3438       nir_emit_shared_atomic(bld, BRW_AOP_AND, instr);
3439       break;
3440    case nir_intrinsic_shared_atomic_or:
3441       nir_emit_shared_atomic(bld, BRW_AOP_OR, instr);
3442       break;
3443    case nir_intrinsic_shared_atomic_xor:
3444       nir_emit_shared_atomic(bld, BRW_AOP_XOR, instr);
3445       break;
3446    case nir_intrinsic_shared_atomic_exchange:
3447       nir_emit_shared_atomic(bld, BRW_AOP_MOV, instr);
3448       break;
3449    case nir_intrinsic_shared_atomic_comp_swap:
3450       nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr);
3451       break;
3452 
3453    case nir_intrinsic_load_shared: {
3454       assert(devinfo->gen >= 7);
3455 
3456       fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
3457 
3458       /* Get the offset to read from */
3459       fs_reg offset_reg;
3460       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3461       if (const_offset) {
3462          offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]);
3463       } else {
3464          offset_reg = vgrf(glsl_type::uint_type);
3465          bld.ADD(offset_reg,
3466                  retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
3467                  brw_imm_ud(instr->const_index[0]));
3468       }
3469 
3470       /* Read the vector */
3471       do_untyped_vector_read(bld, dest, surf_index, offset_reg,
3472                              instr->num_components);
3473       break;
3474    }
3475 
3476    case nir_intrinsic_store_shared: {
3477       assert(devinfo->gen >= 7);
3478 
3479       /* Block index */
3480       fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
3481 
3482       /* Value */
3483       fs_reg val_reg = get_nir_src(instr->src[0]);
3484 
3485       /* Writemask */
3486       unsigned writemask = instr->const_index[1];
3487 
3488       /* get_nir_src() retypes to integer. Be wary of 64-bit types though
3489        * since the untyped writes below operate in units of 32-bits, which
3490        * means that we need to write twice as many components each time.
3491        * Also, we have to shuffle 64-bit data to be in the appropriate layout
3492        * expected by our 32-bit write messages.
3493        */
3494       unsigned type_size = 4;
3495       unsigned bit_size = instr->src[0].is_ssa ?
3496          instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
3497       if (bit_size == 64) {
3498          type_size = 8;
3499          fs_reg tmp =
3500            fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type);
3501          shuffle_64bit_data_for_32bit_write(
3502             bld,
3503             retype(tmp, BRW_REGISTER_TYPE_F),
3504             retype(val_reg, BRW_REGISTER_TYPE_DF),
3505             instr->num_components);
3506          val_reg = tmp;
3507       }
3508 
3509       unsigned type_slots = type_size / 4;
3510 
3511       /* Combine groups of consecutive enabled channels in one write
3512        * message. We use ffs to find the first enabled channel and then ffs on
3513        * the bit-inverse, down-shifted writemask to determine the length of
3514        * the block of enabled bits.
3515        */
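       /* For example, writemask 0b1011 results in two messages: one covering
        * components 0-1 (length 2) and another covering component 3.
        */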
3516       while (writemask) {
3517          unsigned first_component = ffs(writemask) - 1;
3518          unsigned length = ffs(~(writemask >> first_component)) - 1;
3519 
3520          /* We can't write more than 2 64-bit components at once. Limit the
3521           * length of the write to what we can do and let the next iteration
3522           * handle the rest
3523           */
3524          if (type_size > 4)
3525             length = MIN2(2, length);
3526 
3527          fs_reg offset_reg;
3528          nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
3529          if (const_offset) {
3530             offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0] +
3531                                     type_size * first_component);
3532          } else {
3533             offset_reg = vgrf(glsl_type::uint_type);
3534             bld.ADD(offset_reg,
3535                     retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD),
3536                     brw_imm_ud(instr->const_index[0] + type_size * first_component));
3537          }
3538 
3539          emit_untyped_write(bld, surf_index, offset_reg,
3540                             offset(val_reg, bld, first_component * type_slots),
3541                             1 /* dims */, length * type_slots,
3542                             BRW_PREDICATE_NONE);
3543 
3544          /* Clear the bits in the writemask that we just wrote, then try
3545           * again to see if more channels are left.
3546           */
3547          writemask &= (15 << (first_component + length));
3548       }
3549 
3550       break;
3551    }
3552 
3553    default:
3554       nir_emit_intrinsic(bld, instr);
3555       break;
3556    }
3557 }
3558 
3559 void
3560 fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
3561 {
3562    fs_reg dest;
3563    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3564       dest = get_nir_dest(instr->dest);
3565 
3566    switch (instr->intrinsic) {
3567    case nir_intrinsic_atomic_counter_inc:
3568    case nir_intrinsic_atomic_counter_dec:
3569    case nir_intrinsic_atomic_counter_read:
3570    case nir_intrinsic_atomic_counter_add:
3571    case nir_intrinsic_atomic_counter_min:
3572    case nir_intrinsic_atomic_counter_max:
3573    case nir_intrinsic_atomic_counter_and:
3574    case nir_intrinsic_atomic_counter_or:
3575    case nir_intrinsic_atomic_counter_xor:
3576    case nir_intrinsic_atomic_counter_exchange:
3577    case nir_intrinsic_atomic_counter_comp_swap: {
3578       if (stage == MESA_SHADER_FRAGMENT &&
3579           instr->intrinsic != nir_intrinsic_atomic_counter_read)
3580          brw_wm_prog_data(prog_data)->has_side_effects = true;
3581 
3582       /* Get some metadata from the atomic intrinsic. */
3583       const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
3584 
3585       /* Get the arguments of the atomic intrinsic. */
3586       const fs_reg offset = get_nir_src(instr->src[0]);
3587       const unsigned surface = (stage_prog_data->binding_table.abo_start +
3588                                 instr->const_index[0]);
3589       const fs_reg src0 = (info->num_srcs >= 2
3590                            ? get_nir_src(instr->src[1]) : fs_reg());
3591       const fs_reg src1 = (info->num_srcs >= 3
3592                            ? get_nir_src(instr->src[2]) : fs_reg());
3593       fs_reg tmp;
3594 
3595       assert(info->num_srcs <= 3);
3596 
3597       /* Emit a surface read or atomic op. */
3598       if (instr->intrinsic == nir_intrinsic_atomic_counter_read) {
3599          tmp = emit_untyped_read(bld, brw_imm_ud(surface), offset, 1, 1);
3600       } else {
3601          tmp = emit_untyped_atomic(bld, brw_imm_ud(surface), offset, src0,
3602                                    src1, 1, 1,
3603                                    get_atomic_counter_op(instr->intrinsic));
3604       }
3605 
3606       /* Assign the result. */
3607       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), tmp);
3608 
3609       /* Mark the surface as used. */
3610       brw_mark_surface_used(stage_prog_data, surface);
3611       break;
3612    }
3613 
3614    case nir_intrinsic_image_load:
3615    case nir_intrinsic_image_store:
3616    case nir_intrinsic_image_atomic_add:
3617    case nir_intrinsic_image_atomic_min:
3618    case nir_intrinsic_image_atomic_max:
3619    case nir_intrinsic_image_atomic_and:
3620    case nir_intrinsic_image_atomic_or:
3621    case nir_intrinsic_image_atomic_xor:
3622    case nir_intrinsic_image_atomic_exchange:
3623    case nir_intrinsic_image_atomic_comp_swap: {
3624       using namespace image_access;
3625 
3626       if (stage == MESA_SHADER_FRAGMENT &&
3627           instr->intrinsic != nir_intrinsic_image_load)
3628          brw_wm_prog_data(prog_data)->has_side_effects = true;
3629 
3630       /* Get the referenced image variable and type. */
3631       const nir_variable *var = instr->variables[0]->var;
3632       const glsl_type *type = var->type->without_array();
3633       const brw_reg_type base_type = get_image_base_type(type);
3634 
3635       /* Get some metadata from the image intrinsic. */
3636       const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
3637       const unsigned arr_dims = type->sampler_array ? 1 : 0;
3638       const unsigned surf_dims = type->coordinate_components() - arr_dims;
3639       const unsigned format = var->data.image.format;
3640 
3641       /* Get the arguments of the image intrinsic. */
3642       const fs_reg image = get_nir_image_deref(instr->variables[0]);
3643       const fs_reg addr = retype(get_nir_src(instr->src[0]),
3644                                  BRW_REGISTER_TYPE_UD);
3645       const fs_reg src0 = (info->num_srcs >= 3 ?
3646                            retype(get_nir_src(instr->src[2]), base_type) :
3647                            fs_reg());
3648       const fs_reg src1 = (info->num_srcs >= 4 ?
3649                            retype(get_nir_src(instr->src[3]), base_type) :
3650                            fs_reg());
3651       fs_reg tmp;
3652 
3653       /* Emit an image load, store or atomic op. */
3654       if (instr->intrinsic == nir_intrinsic_image_load)
3655          tmp = emit_image_load(bld, image, addr, surf_dims, arr_dims, format);
3656 
3657       else if (instr->intrinsic == nir_intrinsic_image_store)
3658          emit_image_store(bld, image, addr, src0, surf_dims, arr_dims,
3659                           var->data.image.write_only ? GL_NONE : format);
3660 
3661       else
3662          tmp = emit_image_atomic(bld, image, addr, src0, src1,
3663                                  surf_dims, arr_dims, info->dest_components,
3664                                  get_image_atomic_op(instr->intrinsic, type));
3665 
3666       /* Assign the result. */
3667       for (unsigned c = 0; c < info->dest_components; ++c)
3668          bld.MOV(offset(retype(dest, base_type), bld, c),
3669                  offset(tmp, bld, c));
3670       break;
3671    }
3672 
3673    case nir_intrinsic_memory_barrier_atomic_counter:
3674    case nir_intrinsic_memory_barrier_buffer:
3675    case nir_intrinsic_memory_barrier_image:
3676    case nir_intrinsic_memory_barrier: {
3677       const fs_builder ubld = bld.group(8, 0);
3678       const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
3679       ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
3680          ->size_written = 2 * REG_SIZE;
3681       break;
3682    }
3683 
3684    case nir_intrinsic_group_memory_barrier:
3685    case nir_intrinsic_memory_barrier_shared:
3686       /* We treat these workgroup-level barriers as no-ops.  This should be
3687        * safe at present and as long as:
3688        *
3689        *  - Memory access instructions are not subsequently reordered by the
3690        *    compiler back-end.
3691        *
3692        *  - All threads from a given compute shader workgroup fit within a
3693        *    single subslice and therefore talk to the same HDC shared unit
3694        *    which supposedly guarantees ordering and coherency between threads
3695        *    from the same workgroup.  This may change in the future when we
3696        *    start splitting workgroups across multiple subslices.
3697        *
3698        *  - The context is not in fault-and-stream mode, which could cause
3699        *    memory transactions (including to SLM) prior to the barrier to be
3700        *    replayed after the barrier if a pagefault occurs.  This shouldn't
3701        *    be a problem up to and including SKL because fault-and-stream is
3702        *    not usable due to hardware issues, but that's likely to change in
3703        *    the future.
3704        */
3705       break;
3706 
3707    case nir_intrinsic_shader_clock: {
3708       /* We cannot do anything if there is an event, so ignore it for now */
3709       const fs_reg shader_clock = get_timestamp(bld);
3710       const fs_reg srcs[] = { component(shader_clock, 0),
3711                               component(shader_clock, 1) };
3712       bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
3713       break;
3714    }
3715 
3716    case nir_intrinsic_image_size: {
3717       /* Get the referenced image variable and type. */
3718       const nir_variable *var = instr->variables[0]->var;
3719       const glsl_type *type = var->type->without_array();
3720 
3721       /* Get the size of the image. */
3722       const fs_reg image = get_nir_image_deref(instr->variables[0]);
3723       const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
3724 
3725       /* For 1DArray image types, the array index is stored in the Z component.
3726        * Fix this by swizzling the Z component to the Y component.
3727        */
3728       const bool is_1d_array_image =
3729                   type->sampler_dimensionality == GLSL_SAMPLER_DIM_1D &&
3730                   type->sampler_array;
3731 
3732       /* For CubeArray images, we should count the number of cubes instead
3733        * of the number of faces. Fix it by dividing the Z component by 6.
3734        */
3735       const bool is_cube_array_image =
3736                   type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
3737                   type->sampler_array;
3738 
3739       /* Copy all the components. */
3740       const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
3741       for (unsigned c = 0; c < info->dest_components; ++c) {
3742          if ((int)c >= type->coordinate_components()) {
3743              bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
3744                      brw_imm_d(1));
3745          } else if (c == 1 && is_1d_array_image) {
3746             bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
3747                     offset(size, bld, 2));
3748          } else if (c == 2 && is_cube_array_image) {
3749             bld.emit(SHADER_OPCODE_INT_QUOTIENT,
3750                      offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
3751                      offset(size, bld, c), brw_imm_d(6));
3752          } else {
3753             bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
3754                     offset(size, bld, c));
3755          }
3756        }
3757 
3758       break;
3759    }
3760 
3761    case nir_intrinsic_image_samples:
3762       /* The driver does not support multi-sampled images. */
3763       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
3764       break;
3765 
3766    case nir_intrinsic_load_uniform: {
3767       /* Offsets are in bytes but they should always be multiples of 4 */
3768       assert(instr->const_index[0] % 4 == 0);
3769 
3770       fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type);
3771 
3772       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3773       if (const_offset) {
3774          /* Offsets are in bytes but they should always be multiples of 4 */
3775          assert(const_offset->u32[0] % 4 == 0);
3776          src.offset = const_offset->u32[0];
3777 
3778          for (unsigned j = 0; j < instr->num_components; j++) {
3779             bld.MOV(offset(dest, bld, j), offset(src, bld, j));
3780          }
3781       } else {
3782          fs_reg indirect = retype(get_nir_src(instr->src[0]),
3783                                   BRW_REGISTER_TYPE_UD);
3784 
3785          /* We need to pass a size to the MOV_INDIRECT but we don't want it to
3786           * go past the end of the uniform.  In order to keep the n'th
3787           * component from running past, we subtract off the size of all but
3788           * one component of the vector.
3789           */
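         /* For example, a vec4 of 32-bit floats with const_index[1] == 16
          * gives read_size == 16 - 3 * 4 == 4, so the indirect MOV for each
          * component cannot run past the end of the uniform.
          */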
3790          assert(instr->const_index[1] >=
3791                 instr->num_components * (int) type_sz(dest.type));
3792          unsigned read_size = instr->const_index[1] -
3793             (instr->num_components - 1) * type_sz(dest.type);
3794 
3795          bool supports_64bit_indirects =
3796             !devinfo->is_cherryview && !devinfo->is_broxton;
3797 
3798          if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
3799             for (unsigned j = 0; j < instr->num_components; j++) {
3800                bld.emit(SHADER_OPCODE_MOV_INDIRECT,
3801                         offset(dest, bld, j), offset(src, bld, j),
3802                         indirect, brw_imm_ud(read_size));
3803             }
3804          } else {
3805             const unsigned num_mov_indirects =
3806                type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD);
3807             /* We read a little bit less per MOV INDIRECT, as they are now
3808              * 32-bit ones instead of 64-bit.  Adjust read_size accordingly.
3809              */
3810             const unsigned read_size_32bit = read_size -
3811                 (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD);
3812             for (unsigned j = 0; j < instr->num_components; j++) {
3813                for (unsigned i = 0; i < num_mov_indirects; i++) {
3814                   bld.emit(SHADER_OPCODE_MOV_INDIRECT,
3815                            subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i),
3816                            subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i),
3817                            indirect, brw_imm_ud(read_size_32bit));
3818                }
3819             }
3820          }
3821       }
3822       break;
3823    }
3824 
3825    case nir_intrinsic_load_ubo: {
3826       nir_const_value *const_index = nir_src_as_const_value(instr->src[0]);
3827       fs_reg surf_index;
3828 
3829       if (const_index) {
3830          const unsigned index = stage_prog_data->binding_table.ubo_start +
3831                                 const_index->u32[0];
3832          surf_index = brw_imm_ud(index);
3833          brw_mark_surface_used(prog_data, index);
3834       } else {
3835          /* The block index is not a constant. Evaluate the index expression
3836           * per-channel and add the base UBO index; we have to select a value
3837           * from any live channel.
3838           */
3839          surf_index = vgrf(glsl_type::uint_type);
3840          bld.ADD(surf_index, get_nir_src(instr->src[0]),
3841                  brw_imm_ud(stage_prog_data->binding_table.ubo_start));
3842          surf_index = bld.emit_uniformize(surf_index);
3843 
3844          /* Assume this may touch any UBO. It would be nice to provide
3845           * a tighter bound, but the array information is already lowered away.
3846           */
3847          brw_mark_surface_used(prog_data,
3848                                stage_prog_data->binding_table.ubo_start +
3849                                nir->info->num_ubos - 1);
3850       }
3851 
3852       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
3853       if (const_offset == NULL) {
3854          fs_reg base_offset = retype(get_nir_src(instr->src[1]),
3855                                      BRW_REGISTER_TYPE_UD);
3856 
3857          for (int i = 0; i < instr->num_components; i++)
3858             VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
3859                                        base_offset, i * type_sz(dest.type));
3860       } else {
3861          /* Even if we are loading doubles, a pull constant load will load
3862           * a 32-bit vec4, so should only reserve vgrf space for that. If we
3863           * need to load a full dvec4 we will have to emit 2 loads. This is
3864           * similar to demote_pull_constants(), except that in that case we
3865           * see individual accesses to each component of the vector and then
3866           * we let CSE deal with duplicate loads. Here we see a vector access
3867           * and we have to split it if necessary.
3868           */
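         /* For example, a dvec4 starting at byte offset 48 takes two loads:
          * the cacheline at offset 0 supplies components 0-1 (bytes 48-63)
          * and a second load at offset 64 supplies components 2-3.
          */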
3869          const unsigned type_size = type_sz(dest.type);
3870          const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
3871          const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
3872          const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);
3873 
3874          for (unsigned c = 0; c < instr->num_components;) {
3875             const unsigned base = const_offset->u32[0] + c * type_size;
3876             /* Number of usable components in the next block-aligned load. */
3877             const unsigned count = MIN2(instr->num_components - c,
3878                                         (block_sz - base % block_sz) / type_size);
3879 
3880             ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
3881                       packed_consts, surf_index,
3882                       brw_imm_ud(base & ~(block_sz - 1)));
3883 
3884             const fs_reg consts =
3885                retype(byte_offset(packed_consts, base & (block_sz - 1)),
3886                       dest.type);
3887 
3888             for (unsigned d = 0; d < count; d++)
3889                bld.MOV(offset(dest, bld, c + d), component(consts, d));
3890 
3891             c += count;
3892          }
3893       }
3894       break;
3895    }
3896 
3897    case nir_intrinsic_load_ssbo: {
3898       assert(devinfo->gen >= 7);
3899 
3900       nir_const_value *const_uniform_block =
3901          nir_src_as_const_value(instr->src[0]);
3902 
3903       fs_reg surf_index;
3904       if (const_uniform_block) {
3905          unsigned index = stage_prog_data->binding_table.ssbo_start +
3906                           const_uniform_block->u32[0];
3907          surf_index = brw_imm_ud(index);
3908          brw_mark_surface_used(prog_data, index);
3909       } else {
3910          surf_index = vgrf(glsl_type::uint_type);
3911          bld.ADD(surf_index, get_nir_src(instr->src[0]),
3912                  brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
3913 
3914          /* Assume this may touch any SSBO. It would be nice to provide
3915           * a tighter bound, but the array information is already lowered away.
3916           */
3917          brw_mark_surface_used(prog_data,
3918                                stage_prog_data->binding_table.ssbo_start +
3919                                nir->info->num_ssbos - 1);
3920       }
3921 
3922       fs_reg offset_reg;
3923       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
3924       if (const_offset) {
3925          offset_reg = brw_imm_ud(const_offset->u32[0]);
3926       } else {
3927          offset_reg = get_nir_src(instr->src[1]);
3928       }
3929 
3930       /* Read the vector */
3931       do_untyped_vector_read(bld, dest, surf_index, offset_reg,
3932                              instr->num_components);
3933 
3934       break;
3935    }
3936 
3937    case nir_intrinsic_store_ssbo: {
3938       assert(devinfo->gen >= 7);
3939 
3940       if (stage == MESA_SHADER_FRAGMENT)
3941          brw_wm_prog_data(prog_data)->has_side_effects = true;
3942 
3943       /* Block index */
3944       fs_reg surf_index;
3945       nir_const_value *const_uniform_block =
3946          nir_src_as_const_value(instr->src[1]);
3947       if (const_uniform_block) {
3948          unsigned index = stage_prog_data->binding_table.ssbo_start +
3949                           const_uniform_block->u32[0];
3950          surf_index = brw_imm_ud(index);
3951          brw_mark_surface_used(prog_data, index);
3952       } else {
3953          surf_index = vgrf(glsl_type::uint_type);
3954          bld.ADD(surf_index, get_nir_src(instr->src[1]),
3955                   brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
3956 
3957          brw_mark_surface_used(prog_data,
3958                                stage_prog_data->binding_table.ssbo_start +
3959                                nir->info->num_ssbos - 1);
3960       }
3961 
3962       /* Value */
3963       fs_reg val_reg = get_nir_src(instr->src[0]);
3964 
3965       /* Writemask */
3966       unsigned writemask = instr->const_index[0];
3967 
3968       /* get_nir_src() retypes to integer. Be wary of 64-bit types though
3969        * since the untyped writes below operate in units of 32-bits, which
3970        * means that we need to write twice as many components each time.
3971        * Also, we have to shuffle 64-bit data to be in the appropriate layout
3972        * expected by our 32-bit write messages.
3973        */
3974       unsigned type_size = 4;
3975       unsigned bit_size = instr->src[0].is_ssa ?
3976          instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
3977       if (bit_size == 64) {
3978          type_size = 8;
3979          fs_reg tmp =
3980            fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type);
3981          shuffle_64bit_data_for_32bit_write(bld,
3982             retype(tmp, BRW_REGISTER_TYPE_F),
3983             retype(val_reg, BRW_REGISTER_TYPE_DF),
3984             instr->num_components);
3985          val_reg = tmp;
3986       }
3987 
3988       unsigned type_slots = type_size / 4;
3989 
3990       /* Combine groups of consecutive enabled channels in one write
3991        * message. We use ffs to find the first enabled channel and then ffs on
3992        * the bit-inverse, down-shifted writemask to determine the length of
3993        * the block of enabled bits.
3994        */
3995       while (writemask) {
3996          unsigned first_component = ffs(writemask) - 1;
3997          unsigned length = ffs(~(writemask >> first_component)) - 1;
3998 
3999          /* We can't write more than 2 64-bit components at once. Limit the
4000           * length of the write to what we can do and let the next iteration
4001           * handle the rest
4002           */
4003          if (type_size > 4)
4004             length = MIN2(2, length);
4005 
4006          fs_reg offset_reg;
4007          nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]);
4008          if (const_offset) {
4009             offset_reg = brw_imm_ud(const_offset->u32[0] +
4010                                     type_size * first_component);
4011          } else {
4012             offset_reg = vgrf(glsl_type::uint_type);
4013             bld.ADD(offset_reg,
4014                     retype(get_nir_src(instr->src[2]), BRW_REGISTER_TYPE_UD),
4015                     brw_imm_ud(type_size * first_component));
4016          }
4017 
4018 
4019          emit_untyped_write(bld, surf_index, offset_reg,
4020                             offset(val_reg, bld, first_component * type_slots),
4021                             1 /* dims */, length * type_slots,
4022                             BRW_PREDICATE_NONE);
4023 
4024          /* Clear the bits in the writemask that we just wrote, then try
4025           * again to see if more channels are left.
4026           */
4027          writemask &= (15 << (first_component + length));
4028       }
4029       break;
4030    }
4031 
4032    case nir_intrinsic_store_output: {
4033       fs_reg src = get_nir_src(instr->src[0]);
4034 
4035       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
4036       assert(const_offset && "Indirect output stores not allowed");
4037       fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
4038                                       4 * const_offset->u32[0]), src.type);
4039 
4040       unsigned num_components = instr->num_components;
4041       unsigned first_component = nir_intrinsic_component(instr);
4042       unsigned bit_size = instr->src[0].is_ssa ?
4043          instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
4044       if (bit_size == 64) {
4045          fs_reg tmp =
4046             fs_reg(VGRF, alloc.allocate(2 * num_components),
4047                    BRW_REGISTER_TYPE_F);
4048          shuffle_64bit_data_for_32bit_write(
4049             bld, tmp, retype(src, BRW_REGISTER_TYPE_DF), num_components);
4050          src = retype(tmp, src.type);
4051          num_components *= 2;
4052       }
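           /* After the shuffle each 64-bit component occupies two
            * consecutive 32-bit components of src, which is why
            * num_components was doubled above; the loop below then copies
            * plain 32-bit components into the output register.
            */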
4053 
4054       for (unsigned j = 0; j < num_components; j++) {
4055          bld.MOV(offset(new_dest, bld, j + first_component),
4056                  offset(src, bld, j));
4057       }
4058       break;
4059    }
4060 
4061    case nir_intrinsic_ssbo_atomic_add:
4062       nir_emit_ssbo_atomic(bld, BRW_AOP_ADD, instr);
4063       break;
4064    case nir_intrinsic_ssbo_atomic_imin:
4065       nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr);
4066       break;
4067    case nir_intrinsic_ssbo_atomic_umin:
4068       nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr);
4069       break;
4070    case nir_intrinsic_ssbo_atomic_imax:
4071       nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr);
4072       break;
4073    case nir_intrinsic_ssbo_atomic_umax:
4074       nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr);
4075       break;
4076    case nir_intrinsic_ssbo_atomic_and:
4077       nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr);
4078       break;
4079    case nir_intrinsic_ssbo_atomic_or:
4080       nir_emit_ssbo_atomic(bld, BRW_AOP_OR, instr);
4081       break;
4082    case nir_intrinsic_ssbo_atomic_xor:
4083       nir_emit_ssbo_atomic(bld, BRW_AOP_XOR, instr);
4084       break;
4085    case nir_intrinsic_ssbo_atomic_exchange:
4086       nir_emit_ssbo_atomic(bld, BRW_AOP_MOV, instr);
4087       break;
4088    case nir_intrinsic_ssbo_atomic_comp_swap:
4089       nir_emit_ssbo_atomic(bld, BRW_AOP_CMPWR, instr);
4090       break;
4091 
4092    case nir_intrinsic_get_buffer_size: {
4093       nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
4094       unsigned ssbo_index = const_uniform_block ? const_uniform_block->u32[0] : 0;
4095 
4096       /* A resinfo sampler message is used to get the buffer size.  The
4097        * SIMD8 writeback message consists of four registers and the SIMD16
4098        * writeback message consists of eight destination registers (two per
4099        * component).  Because we are only interested in the first channel of
4100        * the first returned component, where resinfo returns the buffer size
4101        * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
4102        * the dispatch width.
4103        */
4104       const fs_builder ubld = bld.exec_all().group(8, 0);
4105       fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4106       fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
4107 
4108       /* Set LOD = 0 */
4109       ubld.MOV(src_payload, brw_imm_d(0));
4110 
4111       const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index;
4112       fs_inst *inst = ubld.emit(FS_OPCODE_GET_BUFFER_SIZE, ret_payload,
4113                                 src_payload, brw_imm_ud(index));
4114       inst->header_size = 0;
4115       inst->mlen = 1;
4116       inst->size_written = 4 * REG_SIZE;
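           /* The payload is a single register holding LOD = 0 (mlen = 1, no
            * header), while a full four-register SIMD8 writeback is accounted
            * for even though only channel 0 of the first component is
            * consumed below.
            */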
4117 
4118       bld.MOV(retype(dest, ret_payload.type), component(ret_payload, 0));
4119       brw_mark_surface_used(prog_data, index);
4120       break;
4121    }
4122 
4123    case nir_intrinsic_load_channel_num: {
4124       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UW);
4125       dest = retype(dest, BRW_REGISTER_TYPE_UD);
4126       const fs_builder allbld8 = bld.group(8, 0).exec_all();
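           /* brw_imm_v packs eight 4-bit values, so the exec_all MOV below
            * writes channel indices 0-7 into the first eight UW slots; the
            * ADDs then fill channels 8-15 and 16-31 for wider dispatch
            * widths before the result is copied to the UD destination.
            */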
4127       allbld8.MOV(tmp, brw_imm_v(0x76543210));
4128       if (dispatch_width > 8)
4129          allbld8.ADD(byte_offset(tmp, 16), tmp, brw_imm_uw(8u));
4130       if (dispatch_width > 16) {
4131          const fs_builder allbld16 = bld.group(16, 0).exec_all();
4132          allbld16.ADD(byte_offset(tmp, 32), tmp, brw_imm_uw(16u));
4133       }
4134       bld.MOV(dest, tmp);
4135       break;
4136    }
4137 
4138    default:
4139       unreachable("unknown intrinsic");
4140    }
4141 }
4142 
4143 void
4144 fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
4145                                  int op, nir_intrinsic_instr *instr)
4146 {
4147    if (stage == MESA_SHADER_FRAGMENT)
4148       brw_wm_prog_data(prog_data)->has_side_effects = true;
4149 
4150    fs_reg dest;
4151    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4152       dest = get_nir_dest(instr->dest);
4153 
4154    fs_reg surface;
4155    nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]);
4156    if (const_surface) {
4157       unsigned surf_index = stage_prog_data->binding_table.ssbo_start +
4158                             const_surface->u32[0];
4159       surface = brw_imm_ud(surf_index);
4160       brw_mark_surface_used(prog_data, surf_index);
4161    } else {
4162       surface = vgrf(glsl_type::uint_type);
4163       bld.ADD(surface, get_nir_src(instr->src[0]),
4164               brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
4165 
4166       /* Assume this may touch any SSBO. This is the same thing we do for
4167        * other UBO/SSBO accesses with a non-constant surface index.
4168        */
4169       brw_mark_surface_used(prog_data,
4170                             stage_prog_data->binding_table.ssbo_start +
4171                             nir->info->num_ssbos - 1);
4172    }
4173 
4174    fs_reg offset = get_nir_src(instr->src[1]);
4175    fs_reg data1 = get_nir_src(instr->src[2]);
4176    fs_reg data2;
4177    if (op == BRW_AOP_CMPWR)
4178       data2 = get_nir_src(instr->src[3]);
4179 
4180    /* Emit the actual atomic operation */
4181 
4182    fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
4183                                               data1, data2,
4184                                               1 /* dims */, 1 /* rsize */,
4185                                               op,
4186                                               BRW_PREDICATE_NONE);
4187    dest.type = atomic_result.type;
4188    bld.MOV(dest, atomic_result);
4189 }
4190 
4191 void
4192 fs_visitor::nir_emit_shared_atomic(const fs_builder &bld,
4193                                    int op, nir_intrinsic_instr *instr)
4194 {
4195    fs_reg dest;
4196    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4197       dest = get_nir_dest(instr->dest);
4198 
4199    fs_reg surface = brw_imm_ud(GEN7_BTI_SLM);
4200    fs_reg offset;
4201    fs_reg data1 = get_nir_src(instr->src[1]);
4202    fs_reg data2;
4203    if (op == BRW_AOP_CMPWR)
4204       data2 = get_nir_src(instr->src[2]);
4205 
4206    /* Get the offset */
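        /* const_index[0] carries the constant base offset of the shared
         * variable; it is folded into the immediate when the dynamic offset
         * is also constant, and added at runtime otherwise.
         */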
4207    nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
4208    if (const_offset) {
4209       offset = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]);
4210    } else {
4211       offset = vgrf(glsl_type::uint_type);
4212       bld.ADD(offset,
4213               retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
4214               brw_imm_ud(instr->const_index[0]));
4215    }
4216 
4217    /* Emit the actual atomic operation */
4218 
4219    fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
4220                                               data1, data2,
4221                                               1 /* dims */, 1 /* rsize */,
4222                                               op,
4223                                               BRW_PREDICATE_NONE);
4224    dest.type = atomic_result.type;
4225    bld.MOV(dest, atomic_result);
4226 }
4227 
4228 void
4229 fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
4230 {
4231    unsigned texture = instr->texture_index;
4232    unsigned sampler = instr->sampler_index;
4233 
4234    fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
4235 
4236    srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture);
4237    srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(sampler);
4238 
4239    int lod_components = 0;
4240 
4241    /* The hardware requires a LOD for buffer textures */
4242    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
4243       srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0);
4244 
4245    uint32_t header_bits = 0;
4246    for (unsigned i = 0; i < instr->num_srcs; i++) {
4247       fs_reg src = get_nir_src(instr->src[i].src);
4248       switch (instr->src[i].src_type) {
4249       case nir_tex_src_bias:
4250          srcs[TEX_LOGICAL_SRC_LOD] =
4251             retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
4252          break;
4253       case nir_tex_src_comparator:
4254          srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F);
4255          break;
4256       case nir_tex_src_coord:
4257          switch (instr->op) {
4258          case nir_texop_txf:
4259          case nir_texop_txf_ms:
4260          case nir_texop_txf_ms_mcs:
4261          case nir_texop_samples_identical:
4262             srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D);
4263             break;
4264          default:
4265             srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F);
4266             break;
4267          }
4268          break;
4269       case nir_tex_src_ddx:
4270          srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F);
4271          lod_components = nir_tex_instr_src_size(instr, i);
4272          break;
4273       case nir_tex_src_ddy:
4274          srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F);
4275          break;
4276       case nir_tex_src_lod:
4277          switch (instr->op) {
4278          case nir_texop_txs:
4279             srcs[TEX_LOGICAL_SRC_LOD] =
4280                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD);
4281             break;
4282          case nir_texop_txf:
4283             srcs[TEX_LOGICAL_SRC_LOD] =
4284                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D);
4285             break;
4286          default:
4287             srcs[TEX_LOGICAL_SRC_LOD] =
4288                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
4289             break;
4290          }
4291          break;
4292       case nir_tex_src_ms_index:
4293          srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD);
4294          break;
4295 
4296       case nir_tex_src_offset: {
4297          nir_const_value *const_offset =
4298             nir_src_as_const_value(instr->src[i].src);
4299          unsigned offset_bits = 0;
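              /* Constant offsets that brw_texture_offset can encode are
               * folded into the message header bits; anything else is passed
               * through TEX_LOGICAL_SRC_TG4_OFFSET instead.
               */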
4300          if (const_offset &&
4301              brw_texture_offset(const_offset->i32,
4302                                 nir_tex_instr_src_size(instr, i),
4303                                 &offset_bits)) {
4304             header_bits |= offset_bits;
4305          } else {
4306             srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
4307                retype(src, BRW_REGISTER_TYPE_D);
4308          }
4309          break;
4310       }
4311 
4312       case nir_tex_src_projector:
4313          unreachable("should be lowered");
4314 
4315       case nir_tex_src_texture_offset: {
4316          /* Figure out the highest possible texture index and mark it as used */
4317          uint32_t max_used = texture + instr->texture_array_size - 1;
4318          if (instr->op == nir_texop_tg4 && devinfo->gen < 8) {
4319             max_used += stage_prog_data->binding_table.gather_texture_start;
4320          } else {
4321             max_used += stage_prog_data->binding_table.texture_start;
4322          }
4323          brw_mark_surface_used(prog_data, max_used);
4324 
4325          /* Emit code to evaluate the actual indexing expression */
4326          fs_reg tmp = vgrf(glsl_type::uint_type);
4327          bld.ADD(tmp, src, brw_imm_ud(texture));
4328          srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp);
4329          break;
4330       }
4331 
4332       case nir_tex_src_sampler_offset: {
4333          /* Emit code to evaluate the actual indexing expression */
4334          fs_reg tmp = vgrf(glsl_type::uint_type);
4335          bld.ADD(tmp, src, brw_imm_ud(sampler));
4336          srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp);
4337          break;
4338       }
4339 
4340       case nir_tex_src_ms_mcs:
4341          assert(instr->op == nir_texop_txf_ms);
4342          srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D);
4343          break;
4344 
4345       case nir_tex_src_plane: {
4346          nir_const_value *const_plane =
4347             nir_src_as_const_value(instr->src[i].src);
4348          const uint32_t plane = const_plane->u32[0];
4349          const uint32_t texture_index =
4350             instr->texture_index +
4351             stage_prog_data->binding_table.plane_start[plane] -
4352             stage_prog_data->binding_table.texture_start;
4353 
4354          srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture_index);
4355          break;
4356       }
4357 
4358       default:
4359          unreachable("unknown texture source");
4360       }
4361    }
4362 
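        /* txf_ms and samples_identical need an MCS operand: fetch it below
         * for surfaces using the compressed multisample layout, or pass a
         * constant 0 otherwise.
         */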
4363    if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
4364        (instr->op == nir_texop_txf_ms ||
4365         instr->op == nir_texop_samples_identical)) {
4366       if (devinfo->gen >= 7 &&
4367           key_tex->compressed_multisample_layout_mask & (1 << texture)) {
4368          srcs[TEX_LOGICAL_SRC_MCS] =
4369             emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE],
4370                            instr->coord_components,
4371                            srcs[TEX_LOGICAL_SRC_SURFACE]);
4372       } else {
4373          srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u);
4374       }
4375    }
4376 
4377    srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);
4378    srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);
4379 
4380    enum opcode opcode;
4381    switch (instr->op) {
4382    case nir_texop_tex:
4383       opcode = (stage == MESA_SHADER_FRAGMENT ? SHADER_OPCODE_TEX_LOGICAL :
4384                 SHADER_OPCODE_TXL_LOGICAL);
4385       break;
4386    case nir_texop_txb:
4387       opcode = FS_OPCODE_TXB_LOGICAL;
4388       break;
4389    case nir_texop_txl:
4390       opcode = SHADER_OPCODE_TXL_LOGICAL;
4391       break;
4392    case nir_texop_txd:
4393       opcode = SHADER_OPCODE_TXD_LOGICAL;
4394       break;
4395    case nir_texop_txf:
4396       opcode = SHADER_OPCODE_TXF_LOGICAL;
4397       break;
4398    case nir_texop_txf_ms:
4399       if ((key_tex->msaa_16 & (1 << sampler)))
4400          opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
4401       else
4402          opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
4403       break;
4404    case nir_texop_txf_ms_mcs:
4405       opcode = SHADER_OPCODE_TXF_MCS_LOGICAL;
4406       break;
4407    case nir_texop_query_levels:
4408    case nir_texop_txs:
4409       opcode = SHADER_OPCODE_TXS_LOGICAL;
4410       break;
4411    case nir_texop_lod:
4412       opcode = SHADER_OPCODE_LOD_LOGICAL;
4413       break;
4414    case nir_texop_tg4:
4415       if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE)
4416          opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL;
4417       else
4418          opcode = SHADER_OPCODE_TG4_LOGICAL;
4419       break;
4420    case nir_texop_texture_samples:
4421       opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL;
4422       break;
4423    case nir_texop_samples_identical: {
4424       fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D);
4425 
4426       /* If mcs is an immediate value, it means there is no MCS.  In that case
4427        * just return false.
4428        */
4429       if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) {
4430          bld.MOV(dst, brw_imm_ud(0u));
4431       } else if ((key_tex->msaa_16 & (1 << sampler))) {
4432          fs_reg tmp = vgrf(glsl_type::uint_type);
4433          bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS],
4434                 offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1));
4435          bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ);
4436       } else {
4437          bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u),
4438                  BRW_CONDITIONAL_EQ);
4439       }
4440       return;
4441    }
4442    default:
4443       unreachable("unknown texture opcode");
4444    }
4445 
4446    /* TXS and TXL require a LOD but not everything we implement using those
4447     * two opcodes provides one.  Provide a default LOD of 0.
4448     */
4449    if ((opcode == SHADER_OPCODE_TXS_LOGICAL ||
4450         opcode == SHADER_OPCODE_TXL_LOGICAL) &&
4451        srcs[TEX_LOGICAL_SRC_LOD].file == BAD_FILE) {
4452       srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_ud(0u);
4453    }
4454 
4455    if (instr->op == nir_texop_tg4) {
4456       if (instr->component == 1 &&
4457           key_tex->gather_channel_quirk_mask & (1 << texture)) {
4458          /* gather4 sampler is broken for green channel on RG32F --
4459           * we must ask for blue instead.
4460           */
4461          header_bits |= 2 << 16;
4462       } else {
4463          header_bits |= instr->component << 16;
4464       }
4465    }
4466 
4467    fs_reg dst = bld.vgrf(brw_type_for_nir_type(instr->dest_type), 4);
4468    fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
4469    inst->offset = header_bits;
4470 
4471    const unsigned dest_size = nir_tex_instr_dest_size(instr);
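        /* On gen9+ the writeback is trimmed to the components the shader
         * actually reads (not done for tg4 or query_levels here); otherwise
         * a full four-component writeback is assumed.
         */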
4472    if (devinfo->gen >= 9 &&
4473        instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) {
4474       unsigned write_mask = instr->dest.is_ssa ?
4475                             nir_ssa_def_components_read(&instr->dest.ssa):
4476                             (1 << dest_size) - 1;
4477       assert(write_mask != 0); /* dead code should have been eliminated */
4478       inst->size_written = util_last_bit(write_mask) *
4479                            inst->dst.component_size(inst->exec_size);
4480    } else {
4481       inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
4482    }
4483 
4484    if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
4485       inst->shadow_compare = true;
4486 
4487    if (instr->op == nir_texop_tg4 && devinfo->gen == 6)
4488       emit_gen6_gather_wa(key_tex->gen6_gather_wa[texture], dst);
4489 
4490    fs_reg nir_dest[4];
4491    for (unsigned i = 0; i < dest_size; i++)
4492       nir_dest[i] = offset(dst, bld, i);
4493 
4494    if (instr->op == nir_texop_query_levels) {
4495       /* # levels is in .w */
4496       nir_dest[0] = offset(dst, bld, 3);
4497    } else if (instr->op == nir_texop_txs &&
4498               dest_size >= 3 && devinfo->gen < 7) {
4499       /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
4500       fs_reg depth = offset(dst, bld, 2);
4501       nir_dest[2] = vgrf(glsl_type::int_type);
4502       bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE);
4503    }
4504 
4505    bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0);
4506 }
4507 
4508 void
4509 fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
4510 {
4511    switch (instr->type) {
4512    case nir_jump_break:
4513       bld.emit(BRW_OPCODE_BREAK);
4514       break;
4515    case nir_jump_continue:
4516       bld.emit(BRW_OPCODE_CONTINUE);
4517       break;
4518    case nir_jump_return:
4519    default:
4520       unreachable("unknown jump");
4521    }
4522 }
4523 
4524 /**
4525  * This helper takes the result of a load operation that reads 32-bit elements
4526  * in this format:
4527  *
4528  * x x x x x x x x
4529  * y y y y y y y y
4530  * z z z z z z z z
4531  * w w w w w w w w
4532  *
4533  * and shuffles the data to get this:
4534  *
4535  * x y x y x y x y
4536  * x y x y x y x y
4537  * z w z w z w z w
4538  * z w z w z w z w
4539  *
4540  * Which is exactly what we want if the load is reading 64-bit components
4541  * like doubles, where x represents the low 32 bits of the x double component
4542  * and y represents the high 32 bits of the x double component (likewise with
4543  * z and w for double component y). The parameter @components represents
4544  * the number of 64-bit components present in @src. This would typically be
4545  * 2 at most, since we can only fit 2 double elements in the result of a
4546  * vec4 load.
4547  *
4548  * Notice that @dst and @src can be the same register.
4549  */
4550 void
4551 shuffle_32bit_load_result_to_64bit_data(const fs_builder &bld,
4552                                         const fs_reg &dst,
4553                                         const fs_reg &src,
4554                                         uint32_t components)
4555 {
4556    assert(type_sz(src.type) == 4);
4557    assert(type_sz(dst.type) == 8);
4558 
4559    /* A temporary that we will use to shuffle the 32-bit data of each
4560     * component in the vector into valid 64-bit data. We can't write directly
4561     * to dst because dst can be (and would usually be) the same as src
4562     * and in that case the first MOV in the loop below would overwrite the
4563     * data read in the second MOV.
4564     */
4565    fs_reg tmp = bld.vgrf(dst.type);
4566 
4567    for (unsigned i = 0; i < components; i++) {
4568       const fs_reg component_i = offset(src, bld, 2 * i);
4569 
4570       bld.MOV(subscript(tmp, src.type, 0), component_i);
4571       bld.MOV(subscript(tmp, src.type, 1), offset(component_i, bld, 1));
4572 
4573       bld.MOV(offset(dst, bld, i), tmp);
4574    }
4575 }
4576 
4577 /**
4578  * This helper does the inverse operation of
4579  * SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA.
4580  *
4581  * We need to do this when we are going to use untyped write messages that
4582  * operate on 32-bit components in order to arrange our 64-bit data to be
4583  * in the expected layout.
4584  *
4585  * Notice that callers of this function, unlike in the case of the inverse
4586  * operation, would typically need to call this with dst and src being
4587  * different registers, since they would otherwise corrupt the original
4588  * 64-bit data they are about to write. Because of this the function checks
4589  * that the src and dst regions involved in the operation do not overlap.
4590  */
4591 void
4592 shuffle_64bit_data_for_32bit_write(const fs_builder &bld,
4593                                    const fs_reg &dst,
4594                                    const fs_reg &src,
4595                                    uint32_t components)
4596 {
4597    assert(type_sz(src.type) == 8);
4598    assert(type_sz(dst.type) == 4);
4599 
4600    assert(!regions_overlap(
4601              dst, 2 * components * dst.component_size(bld.dispatch_width()),
4602              src, components * src.component_size(bld.dispatch_width())));
4603 
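        /* For illustration: with components == 2 the loop below emits four
         * MOVs producing dst = { lo(x), hi(x), lo(y), hi(y) } in consecutive
         * 32-bit components.
         */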
4604    for (unsigned i = 0; i < components; i++) {
4605       const fs_reg component_i = offset(src, bld, i);
4606       bld.MOV(offset(dst, bld, 2 * i), subscript(component_i, dst.type, 0));
4607       bld.MOV(offset(dst, bld, 2 * i + 1), subscript(component_i, dst.type, 1));
4608    }
4609 }
4610 
4611 fs_reg
4612 setup_imm_df(const fs_builder &bld, double v)
4613 {
4614    const struct gen_device_info *devinfo = bld.shader->devinfo;
4615    assert(devinfo->gen >= 7);
4616 
4617    if (devinfo->gen >= 8)
4618       return brw_imm_df(v);
4619 
4620    /* gen7.5 does not support DF immediates directly, but the DIM
4621     * instruction allows us to set a 64-bit immediate value.
4622     */
4623    if (devinfo->is_haswell) {
4624       const fs_builder ubld = bld.exec_all().group(1, 0);
4625       fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1);
4626       ubld.DIM(dst, brw_imm_df(v));
4627       return component(dst, 0);
4628    }
4629 
4630    /* gen7 does not support DF immediates, so we generate a 64-bit constant
4631     * by writing the low 32 bits of the constant to suboffset 0 of a VGRF
4632     * and the high 32 bits to suboffset 4, and then applying a stride of 0.
4633     *
4634     * Alternatively, we could produce a normal VGRF (without stride 0) by
4635     * writing to all the channels in the VGRF.  However, that would hit the
4636     * gen7 bug where we have to split writes that span more than one
4637     * register into instructions with a width of 4 (otherwise the write to
4638     * the second register runs into an execmask hardware bug), which isn't
4639     * very nice.
4640     */
4641    union {
4642       double d;
4643       struct {
4644          uint32_t i1;
4645          uint32_t i2;
4646       };
4647    } di;
4648 
4649    di.d = v;
4650 
4651    const fs_builder ubld = bld.exec_all().group(1, 0);
4652    const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
4653    ubld.MOV(tmp, brw_imm_ud(di.i1));
4654    ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2));
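        /* Retyping the dword pair to DF and taking component 0 yields a
         * stride-0 region, so every channel reads the same 64-bit value.
         */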
4655 
4656    return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0);
4657 }
4658