1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "elk_nir.h"
25 #include "elk_nir_private.h"
26 #include "elk_vec4.h"
27 #include "elk_vec4_builder.h"
28 #include "elk_vec4_surface_builder.h"
29 #include "elk_eu.h"
30 #include "nir.h"
31 #include "nir_intrinsics.h"
32 #include "nir_intrinsics_indices.h"
33 
34 using namespace elk;
35 using namespace elk::surface_access;
36 
37 namespace elk {
38 
39 void
40 vec4_visitor::emit_nir_code()
41 {
42    /* Globally set the rounding mode based on the float controls.  gen7 doesn't
43     * support 16-bit floats, and gen8 switches to scalar VS.  So we don't need
44     * to do any per-instruction mode switching the way the scalar FS does.
45     */
46    emit_shader_float_controls_execution_mode();
47    if (nir->num_uniforms > 0)
48       nir_setup_uniforms();
49 
50    nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir));
51 }
52 
53 void
54 vec4_visitor::nir_setup_uniforms()
55 {
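   /* num_uniforms appears to be counted in bytes here (the load_uniform
    * handling below also treats its base and offsets as byte offsets), so
    * dividing by 16 gives the uniform count in vec4 (16-byte) slots.
    */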
56    uniforms = nir->num_uniforms / 16;
57 }
58 
59 void
60 vec4_visitor::nir_emit_impl(nir_function_impl *impl)
61 {
62    nir_ssa_values = ralloc_array(mem_ctx, dst_reg, impl->ssa_alloc);
63 
64    nir_emit_cf_list(&impl->body);
65 }
66 
67 void
68 vec4_visitor::nir_emit_cf_list(exec_list *list)
69 {
70    exec_list_validate(list);
71    foreach_list_typed(nir_cf_node, node, node, list) {
72       switch (node->type) {
73       case nir_cf_node_if:
74          nir_emit_if(nir_cf_node_as_if(node));
75          break;
76 
77       case nir_cf_node_loop:
78          nir_emit_loop(nir_cf_node_as_loop(node));
79          break;
80 
81       case nir_cf_node_block:
82          nir_emit_block(nir_cf_node_as_block(node));
83          break;
84 
85       default:
86          unreachable("Invalid CFG node block");
87       }
88    }
89 }
90 
91 void
92 vec4_visitor::nir_emit_if(nir_if *if_stmt)
93 {
94    /* First, put the condition in f0 */
95    src_reg condition = get_nir_src(if_stmt->condition, ELK_REGISTER_TYPE_D, 1);
96    vec4_instruction *inst = emit(MOV(dst_null_d(), condition));
97    inst->conditional_mod = ELK_CONDITIONAL_NZ;
98 
99    /* We can just predicate based on the X channel, as the condition only
100     * goes on its own line */
101    emit(IF(ELK_PREDICATE_ALIGN16_REPLICATE_X));
102 
103    nir_emit_cf_list(&if_stmt->then_list);
104 
105    if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
106       emit(ELK_OPCODE_ELSE);
107       nir_emit_cf_list(&if_stmt->else_list);
108    }
109 
110    emit(ELK_OPCODE_ENDIF);
111 }
112 
113 void
114 vec4_visitor::nir_emit_loop(nir_loop *loop)
115 {
116    assert(!nir_loop_has_continue_construct(loop));
117    emit(ELK_OPCODE_DO);
118 
119    nir_emit_cf_list(&loop->body);
120 
121    emit(ELK_OPCODE_WHILE);
122 }
123 
124 void
125 vec4_visitor::nir_emit_block(nir_block *block)
126 {
127    nir_foreach_instr(instr, block) {
128       nir_emit_instr(instr);
129    }
130 }
131 
132 void
133 vec4_visitor::nir_emit_instr(nir_instr *instr)
134 {
135    base_ir = instr;
136 
137    switch (instr->type) {
138    case nir_instr_type_load_const:
139       nir_emit_load_const(nir_instr_as_load_const(instr));
140       break;
141 
142    case nir_instr_type_intrinsic:
143       nir_emit_intrinsic(nir_instr_as_intrinsic(instr));
144       break;
145 
146    case nir_instr_type_alu:
147       nir_emit_alu(nir_instr_as_alu(instr));
148       break;
149 
150    case nir_instr_type_jump:
151       nir_emit_jump(nir_instr_as_jump(instr));
152       break;
153 
154    case nir_instr_type_tex:
155       nir_emit_texture(nir_instr_as_tex(instr));
156       break;
157 
158    case nir_instr_type_undef:
159       nir_emit_undef(nir_instr_as_undef(instr));
160       break;
161 
162    default:
163       unreachable("VS instruction not yet implemented by NIR->vec4");
164    }
165 }
166 
167 static dst_reg
168 dst_reg_for_nir_reg(vec4_visitor *v, nir_def *handle,
169                     unsigned base_offset, nir_src *indirect)
170 {
171    nir_intrinsic_instr *decl = nir_reg_get_decl(handle);
172    dst_reg reg = v->nir_ssa_values[handle->index];
173    if (nir_intrinsic_bit_size(decl) == 64)
174       reg.type = ELK_REGISTER_TYPE_DF;
175 
176    reg = offset(reg, 8, base_offset);
177    if (indirect) {
178       reg.reladdr =
179          new(v->mem_ctx) src_reg(v->get_nir_src(*indirect,
180                                                 ELK_REGISTER_TYPE_D,
181                                                 1));
182    }
183    return reg;
184 }
185 
186 dst_reg
187 vec4_visitor::get_nir_def(const nir_def &def)
188 {
189    nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
190    if (!store_reg) {
191       dst_reg dst =
192          dst_reg(VGRF, alloc.allocate(DIV_ROUND_UP(def.bit_size, 32)));
193       if (def.bit_size == 64)
194          dst.type = ELK_REGISTER_TYPE_DF;
195       nir_ssa_values[def.index] = dst;
196       return dst;
197    } else {
198       nir_src *indirect =
199          (store_reg->intrinsic == nir_intrinsic_store_reg_indirect) ?
200          &store_reg->src[2] : NULL;
201 
202       dst_reg dst = dst_reg_for_nir_reg(this, store_reg->src[1].ssa,
203                                         nir_intrinsic_base(store_reg),
204                                         indirect);
205       dst.writemask = nir_intrinsic_write_mask(store_reg);
206       return dst;
207    }
208 }
209 
210 dst_reg
211 vec4_visitor::get_nir_def(const nir_def &def, enum elk_reg_type type)
212 {
213    return retype(get_nir_def(def), type);
214 }
215 
216 dst_reg
217 vec4_visitor::get_nir_def(const nir_def &def, nir_alu_type type)
218 {
219    return get_nir_def(def, elk_type_for_nir_type(devinfo, type));
220 }
221 
222 src_reg
223 vec4_visitor::get_nir_src(const nir_src &src, enum elk_reg_type type,
224                           unsigned num_components)
225 {
226    nir_intrinsic_instr *load_reg = nir_load_reg_for_def(src.ssa);
227 
228    dst_reg reg;
229    if (load_reg) {
230       nir_src *indirect =
231          (load_reg->intrinsic == nir_intrinsic_load_reg_indirect) ?
232          &load_reg->src[1] : NULL;
233 
234       reg = dst_reg_for_nir_reg(this, load_reg->src[0].ssa,
235                                       nir_intrinsic_base(load_reg),
236                                       indirect);
237    } else {
238       reg = nir_ssa_values[src.ssa->index];
239    }
240 
241    reg = retype(reg, type);
242 
243    src_reg reg_as_src = src_reg(reg);
244    reg_as_src.swizzle = elk_swizzle_for_size(num_components);
245    return reg_as_src;
246 }
247 
248 src_reg
249 vec4_visitor::get_nir_src(const nir_src &src, nir_alu_type type,
250                           unsigned num_components)
251 {
252    return get_nir_src(src, elk_type_for_nir_type(devinfo, type),
253                       num_components);
254 }
255 
256 src_reg
257 vec4_visitor::get_nir_src(const nir_src &src, unsigned num_components)
258 {
259    /* if type is not specified, default to signed int */
260    return get_nir_src(src, nir_type_int32, num_components);
261 }
262 
263 src_reg
264 vec4_visitor::get_nir_src_imm(const nir_src &src)
265 {
266    assert(nir_src_num_components(src) == 1);
267    assert(nir_src_bit_size(src) == 32);
268    return nir_src_is_const(src) ? src_reg(elk_imm_d(nir_src_as_int(src))) :
269                                   get_nir_src(src, 1);
270 }
271 
272 src_reg
273 vec4_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
274 {
275    nir_src *offset_src = nir_get_io_offset_src(instr);
276 
277    if (nir_src_is_const(*offset_src)) {
278       /* The only constant offset we should find is 0.  elk_nir.c's
279        * add_const_offset_to_base() will fold other constant offsets
280        * into the base index.
281        */
282       assert(nir_src_as_uint(*offset_src) == 0);
283       return src_reg();
284    }
285 
286    return get_nir_src(*offset_src, ELK_REGISTER_TYPE_UD, 1);
287 }
288 
289 static src_reg
290 elk_setup_imm_df(const vec4_builder &bld, double v)
291 {
292    const intel_device_info *devinfo = bld.shader->devinfo;
293    assert(devinfo->ver == 7);
294 
295    /* gfx7.5 does not support DF immediates directly, but the DIM
296     * instruction allows us to set the 64-bit immediate value.
297     */
298    if (devinfo->verx10 == 75) {
299       const vec4_builder ubld = bld.exec_all();
300       const dst_reg dst = bld.vgrf(ELK_REGISTER_TYPE_DF);
301       ubld.DIM(dst, elk_imm_df(v));
302       return swizzle(src_reg(dst), ELK_SWIZZLE_XXXX);
303    }
304 
305    /* gfx7 does not support DF immediates */
306    union {
307       double d;
308       struct {
309          uint32_t i1;
310          uint32_t i2;
311       };
312    } di;
313 
314    di.d = v;
315 
316    /* Write the low 32 bits of the constant to the X:UD channel and the
317     * high 32 bits to the Y:UD channel to build the constant in a VGRF.
318     * We have to do this twice (offset 0 and offset 1), since a DF VGRF takes
319     * two SIMD8 registers in SIMD4x2 execution. Finally, return a swizzle
320     * XXXX so any access to the VGRF only reads the constant data in these
321     * channels.
322     */
323    const dst_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD, 2);
324    for (unsigned n = 0; n < 2; n++) {
325       const vec4_builder ubld = bld.exec_all().group(4, n);
326       ubld.MOV(writemask(offset(tmp, 8, n), WRITEMASK_X), elk_imm_ud(di.i1));
327       ubld.MOV(writemask(offset(tmp, 8, n), WRITEMASK_Y), elk_imm_ud(di.i2));
328    }
329 
330    return swizzle(src_reg(retype(tmp, ELK_REGISTER_TYPE_DF)), ELK_SWIZZLE_XXXX);
331 }
332 
333 void
334 vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr)
335 {
336    dst_reg reg;
337 
338    if (instr->def.bit_size == 64) {
339       reg = dst_reg(VGRF, alloc.allocate(2));
340       reg.type = ELK_REGISTER_TYPE_DF;
341    } else {
342       reg = dst_reg(VGRF, alloc.allocate(1));
343       reg.type = ELK_REGISTER_TYPE_D;
344    }
345 
346    const vec4_builder ibld = vec4_builder(this).at_end();
347    unsigned remaining = elk_writemask_for_size(instr->def.num_components);
348 
349    /* @FIXME: consider emitting vector operations to save some MOVs in
350     * cases where the components are representable in 8 bits.
351     * For now, we emit a MOV for each distinct value.
352     */
353    for (unsigned i = 0; i < instr->def.num_components; i++) {
354       unsigned writemask = 1 << i;
355 
356       if ((remaining & writemask) == 0)
357          continue;
358 
359       for (unsigned j = i; j < instr->def.num_components; j++) {
360          if ((instr->def.bit_size == 32 &&
361               instr->value[i].u32 == instr->value[j].u32) ||
362              (instr->def.bit_size == 64 &&
363               instr->value[i].f64 == instr->value[j].f64)) {
364             writemask |= 1 << j;
365          }
366       }
367 
368       reg.writemask = writemask;
369       if (instr->def.bit_size == 64) {
370          emit(MOV(reg, elk_setup_imm_df(ibld, instr->value[i].f64)));
371       } else {
372          emit(MOV(reg, elk_imm_d(instr->value[i].i32)));
373       }
374 
375       remaining &= ~writemask;
376    }
377 
378    /* Set final writemask */
379    reg.writemask = elk_writemask_for_size(instr->def.num_components);
380 
381    nir_ssa_values[instr->def.index] = reg;
382 }
383 
384 src_reg
385 vec4_visitor::get_nir_ssbo_intrinsic_index(nir_intrinsic_instr *instr)
386 {
387    /* SSBO stores are weird in that their index is in src[1] */
388    const unsigned src = instr->intrinsic == nir_intrinsic_store_ssbo ? 1 : 0;
389 
390    if (nir_src_is_const(instr->src[src])) {
391       return elk_imm_ud(nir_src_as_uint(instr->src[src]));
392    } else {
393       return emit_uniformize(get_nir_src(instr->src[src]));
394    }
395 }
396 
397 void
398 vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
399 {
400    dst_reg dest;
401    src_reg src;
402 
403    switch (instr->intrinsic) {
404    case nir_intrinsic_decl_reg: {
405       unsigned bit_size = nir_intrinsic_bit_size(instr);
406       unsigned array_elems = nir_intrinsic_num_array_elems(instr);
407       if (array_elems == 0)
408          array_elems = 1;
409 
410       const unsigned num_regs = array_elems * DIV_ROUND_UP(bit_size, 32);
411       dst_reg reg(VGRF, alloc.allocate(num_regs));
412       if (bit_size == 64)
413          reg.type = ELK_REGISTER_TYPE_DF;
414 
415       nir_ssa_values[instr->def.index] = reg;
416       break;
417    }
418 
419    case nir_intrinsic_load_reg:
420    case nir_intrinsic_load_reg_indirect:
421    case nir_intrinsic_store_reg:
422    case nir_intrinsic_store_reg_indirect:
423       /* Nothing to do with these. */
424       break;
425 
426    case nir_intrinsic_load_input: {
427       assert(instr->def.bit_size == 32);
428       /* We set EmitNoIndirectInput for VS */
429       unsigned load_offset = nir_src_as_uint(instr->src[0]);
430 
431       dest = get_nir_def(instr->def);
432 
433       src = src_reg(ATTR, nir_intrinsic_base(instr) + load_offset,
434                     glsl_uvec4_type());
435       src = retype(src, dest.type);
436 
437       /* Swizzle source based on component layout qualifier */
438       src.swizzle = ELK_SWZ_COMP_INPUT(nir_intrinsic_component(instr));
439       emit(MOV(dest, src));
440       break;
441    }
442 
443    case nir_intrinsic_store_output: {
444       assert(nir_src_bit_size(instr->src[0]) == 32);
445       unsigned store_offset = nir_src_as_uint(instr->src[1]);
446       int varying = nir_intrinsic_base(instr) + store_offset;
447       src = get_nir_src(instr->src[0], ELK_REGISTER_TYPE_F,
448                         instr->num_components);
449 
450       unsigned c = nir_intrinsic_component(instr);
451       output_reg[varying][c] = dst_reg(src);
452       output_num_components[varying][c] = instr->num_components;
453       break;
454    }
455 
456    case nir_intrinsic_get_ssbo_size: {
457       assert(nir_src_num_components(instr->src[0]) == 1);
458       unsigned ssbo_index = nir_src_is_const(instr->src[0]) ?
459                             nir_src_as_uint(instr->src[0]) : 0;
460 
461       dst_reg result_dst = get_nir_def(instr->def);
462       vec4_instruction *inst = new(mem_ctx)
463          vec4_instruction(ELK_SHADER_OPCODE_GET_BUFFER_SIZE, result_dst);
464 
465       inst->base_mrf = 2;
466       inst->mlen = 1; /* always at least one */
467       inst->src[1] = elk_imm_ud(ssbo_index);
468 
469       /* MRF for the first parameter */
470       src_reg lod = elk_imm_d(0);
471       int param_base = inst->base_mrf;
472       int writemask = WRITEMASK_X;
473       emit(MOV(dst_reg(MRF, param_base, glsl_int_type(), writemask), lod));
474 
475       emit(inst);
476       break;
477    }
478 
479    case nir_intrinsic_store_ssbo: {
480       assert(devinfo->ver == 7);
481 
482       /* elk_nir_lower_mem_access_bit_sizes takes care of this */
483       assert(nir_src_bit_size(instr->src[0]) == 32);
484       assert(nir_intrinsic_write_mask(instr) ==
485              (1u << instr->num_components) - 1);
486 
487       src_reg surf_index = get_nir_ssbo_intrinsic_index(instr);
488       src_reg offset_reg = retype(get_nir_src_imm(instr->src[2]),
489                                   ELK_REGISTER_TYPE_UD);
490 
491       /* Value */
492       src_reg val_reg = get_nir_src(instr->src[0], ELK_REGISTER_TYPE_F, 4);
493 
494       /* IvyBridge does not have a native SIMD4x2 untyped write message so untyped
495        * writes will use SIMD8 mode. In order to hide this and keep symmetry across
496        * typed and untyped messages and across hardware platforms, the
497        * current implementation of the untyped messages will transparently convert
498        * the SIMD4x2 payload into an equivalent SIMD8 payload by transposing it
499        * and enabling only channel X on the SEND instruction.
500        *
501        * The above works well for full vector writes, but not for partial writes
502        * where we want to write some channels and not others, like when we have
503        * code such as v.xyw = vec3(1,2,4). Because the untyped write messages are
504        * quite restrictive with regards to the channel enables we can configure in
505        * the message descriptor (not all combinations are allowed) we cannot simply
506        * implement these scenarios with a single message while keeping the
507        * aforementioned symmetry in the implementation. For now we de decided that
508        * it is better to keep the symmetry to reduce complexity, so in situations
509        * such as the one described we end up emitting two untyped write messages
510        * (one for xy and another for w).
511        *
512        * The code below packs consecutive channels into a single write message,
513        * detects gaps in the vector write and if needed, sends a second message
514        * with the remaining channels. If in the future we decide that we want to
515        * emit a single message at the expense of losing the symmetry in the
516        * implementation we can:
517        *
518        * 1) For IvyBridge: Only use the red channel of the untyped write SIMD8
519        *    message payload. In this mode we can write up to 8 offsets and dwords
520        *    to the red channel only (for the two vec4s in the SIMD4x2 execution)
521        *    and select which of the 8 channels carry data to write by setting the
522        *    appropriate writemask in the dst register of the SEND instruction.
523        *    It would require writing a new generator opcode specifically for
524        *    IvyBridge since we would need to prepare a SIMD8 payload that could
525        *    use any channel, not just X.
526        *
527        * 2) For Haswell+: Simply send a single write message but set the writemask
528        *    on the dst of the SEND instruction to select the channels we want to
529        *    write. It would require modifying the current messages to receive
530        *    and honor the writemask provided.
531        */
532       const vec4_builder bld = vec4_builder(this).at_end()
533                                .annotate(current_annotation, base_ir);
534 
535       emit_untyped_write(bld, surf_index, offset_reg, val_reg,
536                          1 /* dims */, instr->num_components /* size */,
537                          ELK_PREDICATE_NONE);
538       break;
539    }
540 
541    case nir_intrinsic_load_ssbo: {
542       assert(devinfo->ver == 7);
543 
544       /* elk_nir_lower_mem_access_bit_sizes takes care of this */
545       assert(instr->def.bit_size == 32);
546 
547       src_reg surf_index = get_nir_ssbo_intrinsic_index(instr);
548       src_reg offset_reg = retype(get_nir_src_imm(instr->src[1]),
549                                   ELK_REGISTER_TYPE_UD);
550 
551       /* Read the vector */
552       const vec4_builder bld = vec4_builder(this).at_end()
553          .annotate(current_annotation, base_ir);
554 
555       src_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
556                                               1 /* dims */, 4 /* size*/,
557                                               ELK_PREDICATE_NONE);
558       dst_reg dest = get_nir_def(instr->def);
559       read_result.type = dest.type;
560       read_result.swizzle = elk_swizzle_for_size(instr->num_components);
561       emit(MOV(dest, read_result));
562       break;
563    }
564 
565    case nir_intrinsic_ssbo_atomic:
566    case nir_intrinsic_ssbo_atomic_swap:
567       nir_emit_ssbo_atomic(lsc_op_to_legacy_atomic(elk_lsc_aop_for_nir_intrinsic(instr)), instr);
568       break;
569 
570    case nir_intrinsic_load_vertex_id:
571       unreachable("should be lowered by vertex_id_zero_based");
572 
573    case nir_intrinsic_load_vertex_id_zero_base:
574    case nir_intrinsic_load_base_vertex:
575    case nir_intrinsic_load_instance_id:
576    case nir_intrinsic_load_base_instance:
577    case nir_intrinsic_load_draw_id:
578    case nir_intrinsic_load_invocation_id:
579       unreachable("should be lowered by elk_nir_lower_vs_inputs()");
580 
581    case nir_intrinsic_load_uniform: {
582       /* Offsets are in bytes but they should always be multiples of 4 */
583       assert(nir_intrinsic_base(instr) % 4 == 0);
584 
585       dest = get_nir_def(instr->def);
586 
587       src = src_reg(dst_reg(UNIFORM, nir_intrinsic_base(instr) / 16));
588       src.type = dest.type;
589 
590       /* Uniforms don't actually have to be vec4 aligned.  In the case that
591        * one isn't, we have to use a swizzle to shift things around.  They
592        * do still have the std140 alignment requirement that vec2's have to
593        * be vec2-aligned and vec3's and vec4's have to be vec4-aligned.
594        *
595        * The swizzle also works in the indirect case as the generator adds
596        * the swizzle to the offset for us.
597        */
598       const int type_size = type_sz(src.type);
599       unsigned shift = (nir_intrinsic_base(instr) % 16) / type_size;
600       assert(shift + instr->num_components <= 4);
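      /* e.g. a 32-bit value whose base lands at byte 8 of its vec4 slot gives
       * shift == 2, so (with a zero constant offset) the swizzle computed
       * below ends up starting at component Z.
       */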
601 
602       if (nir_src_is_const(instr->src[0])) {
603          const unsigned load_offset = nir_src_as_uint(instr->src[0]);
604          /* Offsets are in bytes but they should always be multiples of 4 */
605          assert(load_offset % 4 == 0);
606 
607          src.swizzle = elk_swizzle_for_size(instr->num_components);
608          dest.writemask = elk_writemask_for_size(instr->num_components);
609          unsigned offset = load_offset + shift * type_size;
610          src.offset = ROUND_DOWN_TO(offset, 16);
611          shift = (offset % 16) / type_size;
612          assert(shift + instr->num_components <= 4);
613          src.swizzle += ELK_SWIZZLE4(shift, shift, shift, shift);
614 
615          emit(MOV(dest, src));
616       } else {
617          /* Uniform arrays are vec4 aligned, because of std140 alignment
618           * rules.
619           */
620          assert(shift == 0);
621 
622          src_reg indirect = get_nir_src(instr->src[0], ELK_REGISTER_TYPE_UD, 1);
623 
624          /* MOV_INDIRECT is going to stomp the whole thing anyway */
625          dest.writemask = WRITEMASK_XYZW;
626 
627          emit(ELK_SHADER_OPCODE_MOV_INDIRECT, dest, src,
628               indirect, elk_imm_ud(nir_intrinsic_range(instr)));
629       }
630       break;
631    }
632 
633    case nir_intrinsic_load_ubo: {
634       src_reg surf_index;
635 
636       dest = get_nir_def(instr->def);
637 
638       if (nir_src_is_const(instr->src[0])) {
639          /* The block index is a constant, so just emit the binding table entry
640           * as an immediate.
641           */
642          const unsigned index = nir_src_as_uint(instr->src[0]);
643          surf_index = elk_imm_ud(index);
644       } else {
645          /* The block index is not a constant. Evaluate the index expression
646           * per-channel and add the base UBO index; we have to select a value
647           * from any live channel.
648           */
649          surf_index = src_reg(this, glsl_uint_type());
650          emit(MOV(dst_reg(surf_index), get_nir_src(instr->src[0], nir_type_int32,
651                                                    instr->num_components)));
652          surf_index = emit_uniformize(surf_index);
653       }
654 
655       src_reg push_reg;
656       src_reg offset_reg;
657       if (nir_src_is_const(instr->src[1])) {
658          unsigned load_offset = nir_src_as_uint(instr->src[1]);
659          unsigned aligned_offset = load_offset & ~15;
660          offset_reg = elk_imm_ud(aligned_offset);
661 
662          /* See if we've selected this as a push constant candidate */
663          if (nir_src_is_const(instr->src[0])) {
664             const unsigned ubo_block = nir_src_as_uint(instr->src[0]);
665             const unsigned offset_256b = aligned_offset / 32;
666 
667             for (int i = 0; i < 4; i++) {
668                const struct elk_ubo_range *range = &prog_data->base.ubo_ranges[i];
669                if (range->block == ubo_block &&
670                    offset_256b >= range->start &&
671                    offset_256b < range->start + range->length) {
672 
673                   push_reg = src_reg(dst_reg(UNIFORM, UBO_START + i));
674                   push_reg.type = dest.type;
675                   push_reg.offset = aligned_offset - 32 * range->start;
676                   break;
677                }
678             }
679          }
680       } else {
681          offset_reg = src_reg(this, glsl_uint_type());
682          emit(MOV(dst_reg(offset_reg),
683                   get_nir_src(instr->src[1], nir_type_uint32, 1)));
684       }
685 
686       src_reg packed_consts;
687       if (push_reg.file != BAD_FILE) {
688          packed_consts = push_reg;
689       } else if (instr->def.bit_size == 32) {
690          packed_consts = src_reg(this, glsl_vec4_type());
691          emit_pull_constant_load_reg(dst_reg(packed_consts),
692                                      surf_index,
693                                      offset_reg,
694                                      NULL, NULL /* before_block/inst */);
695          prog_data->base.has_ubo_pull = true;
696       } else {
697          src_reg temp = src_reg(this, glsl_dvec4_type());
698          src_reg temp_float = retype(temp, ELK_REGISTER_TYPE_F);
699 
700          emit_pull_constant_load_reg(dst_reg(temp_float),
701                                      surf_index, offset_reg, NULL, NULL);
702          if (offset_reg.file == IMM)
703             offset_reg.ud += 16;
704          else
705             emit(ADD(dst_reg(offset_reg), offset_reg, elk_imm_ud(16u)));
706          emit_pull_constant_load_reg(dst_reg(byte_offset(temp_float, REG_SIZE)),
707                                      surf_index, offset_reg, NULL, NULL);
708          prog_data->base.has_ubo_pull = true;
709 
710          packed_consts = src_reg(this, glsl_dvec4_type());
711          shuffle_64bit_data(dst_reg(packed_consts), temp, false);
712       }
713 
714       packed_consts.swizzle = elk_swizzle_for_size(instr->num_components);
715       if (nir_src_is_const(instr->src[1])) {
716          unsigned load_offset = nir_src_as_uint(instr->src[1]);
717          unsigned type_size = type_sz(dest.type);
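         /* packed_consts holds a 16-byte aligned vec4 (pushed or pulled);
          * bump the swizzle so the read starts at the requested component,
          * e.g. a 32-bit load at byte offset 8 of its chunk starts at Z.
          */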
718          packed_consts.swizzle +=
719             ELK_SWIZZLE4(load_offset % 16 / type_size,
720                          load_offset % 16 / type_size,
721                          load_offset % 16 / type_size,
722                          load_offset % 16 / type_size);
723       }
724 
725       emit(MOV(dest, retype(packed_consts, dest.type)));
726 
727       break;
728    }
729 
730    case nir_intrinsic_barrier: {
731       if (nir_intrinsic_memory_scope(instr) == SCOPE_NONE)
732          break;
733       const vec4_builder bld =
734          vec4_builder(this).at_end().annotate(current_annotation, base_ir);
735       const dst_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
736       vec4_instruction *fence =
737          bld.emit(ELK_SHADER_OPCODE_MEMORY_FENCE, tmp, elk_vec8_grf(0, 0));
738       fence->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
739       break;
740    }
741 
742    case nir_intrinsic_shader_clock: {
743       /* We cannot do anything if there is an event, so ignore it for now */
744       const src_reg shader_clock = get_timestamp();
745       const enum elk_reg_type type = elk_type_for_base_type(glsl_uvec2_type());
746 
747       dest = get_nir_def(instr->def, type);
748       emit(MOV(dest, shader_clock));
749       break;
750    }
751 
752    default:
753       unreachable("Unknown intrinsic");
754    }
755 }
756 
757 void
758 vec4_visitor::nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr)
759 {
760    dst_reg dest;
761    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
762       dest = get_nir_def(instr->def);
763 
764    src_reg surface = get_nir_ssbo_intrinsic_index(instr);
765    src_reg offset = get_nir_src(instr->src[1], 1);
766    src_reg data1;
767    if (op != ELK_AOP_INC && op != ELK_AOP_DEC && op != ELK_AOP_PREDEC)
768       data1 = get_nir_src(instr->src[2], 1);
769    src_reg data2;
770    if (op == ELK_AOP_CMPWR)
771       data2 = get_nir_src(instr->src[3], 1);
772 
773    /* Emit the actual atomic operation */
774    const vec4_builder bld =
775       vec4_builder(this).at_end().annotate(current_annotation, base_ir);
776 
777    src_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
778                                                data1, data2,
779                                                1 /* dims */, 1 /* rsize */,
780                                                op,
781                                                ELK_PREDICATE_NONE);
782    dest.type = atomic_result.type;
783    bld.MOV(dest, atomic_result);
784 }
785 
786 static unsigned
787 elk_swizzle_for_nir_swizzle(uint8_t swizzle[4])
788 {
789    return ELK_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
790 }
791 
792 bool
793 vec4_visitor::optimize_predicate(nir_alu_instr *instr,
794                                  enum elk_predicate *predicate)
795 {
796    if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
797       return false;
798 
799    nir_alu_instr *cmp_instr =
800       nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
801 
802    switch (cmp_instr->op) {
803    case nir_op_b32any_fnequal2:
804    case nir_op_b32any_inequal2:
805    case nir_op_b32any_fnequal3:
806    case nir_op_b32any_inequal3:
807    case nir_op_b32any_fnequal4:
808    case nir_op_b32any_inequal4:
809       *predicate = ELK_PREDICATE_ALIGN16_ANY4H;
810       break;
811    case nir_op_b32all_fequal2:
812    case nir_op_b32all_iequal2:
813    case nir_op_b32all_fequal3:
814    case nir_op_b32all_iequal3:
815    case nir_op_b32all_fequal4:
816    case nir_op_b32all_iequal4:
817       *predicate = ELK_PREDICATE_ALIGN16_ALL4H;
818       break;
819    default:
820       return false;
821    }
822 
823    unsigned size_swizzle =
824       elk_swizzle_for_size(nir_op_infos[cmp_instr->op].input_sizes[0]);
825 
826    src_reg op[2];
827    assert(nir_op_infos[cmp_instr->op].num_inputs == 2);
828    for (unsigned i = 0; i < 2; i++) {
829       nir_alu_type type = nir_op_infos[cmp_instr->op].input_types[i];
830       unsigned bit_size = nir_src_bit_size(cmp_instr->src[i].src);
831       type = (nir_alu_type) (((unsigned) type) | bit_size);
832       op[i] = get_nir_src(cmp_instr->src[i].src, type, 4);
833       unsigned base_swizzle =
834          elk_swizzle_for_nir_swizzle(cmp_instr->src[i].swizzle);
835       op[i].swizzle = elk_compose_swizzle(size_swizzle, base_swizzle);
836    }
837 
838    emit(CMP(dst_null_d(), op[0], op[1],
839             elk_cmod_for_nir_comparison(cmp_instr->op)));
840 
841    return true;
842 }
843 
844 void
845 vec4_visitor::emit_conversion_from_double(dst_reg dst, src_reg src)
846 {
847    enum elk_opcode op;
848    switch (dst.type) {
849    case ELK_REGISTER_TYPE_D:
850       op = ELK_VEC4_OPCODE_DOUBLE_TO_D32;
851       break;
852    case ELK_REGISTER_TYPE_UD:
853       op = ELK_VEC4_OPCODE_DOUBLE_TO_U32;
854       break;
855    case ELK_REGISTER_TYPE_F:
856       op = ELK_VEC4_OPCODE_DOUBLE_TO_F32;
857       break;
858    default:
859       unreachable("Unknown conversion");
860    }
861 
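   /* Convert through 64-bit-wide temporaries: after the DOUBLE_TO_* opcode
    * the converted values still occupy 64-bit channels, so PICK_LOW_32BIT
    * gathers the 32-bit results before the final move into the destination.
    */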
862    dst_reg temp = dst_reg(this, glsl_dvec4_type());
863    emit(MOV(temp, src));
864    dst_reg temp2 = dst_reg(this, glsl_dvec4_type());
865    emit(op, temp2, src_reg(temp));
866 
867    emit(ELK_VEC4_OPCODE_PICK_LOW_32BIT, retype(temp2, dst.type), src_reg(temp2));
868    emit(MOV(dst, src_reg(retype(temp2, dst.type))));
869 }
870 
871 void
872 vec4_visitor::emit_conversion_to_double(dst_reg dst, src_reg src)
873 {
874    dst_reg tmp_dst = dst_reg(src_reg(this, glsl_dvec4_type()));
875    src_reg tmp_src = retype(src_reg(this, glsl_vec4_type()), src.type);
876    emit(MOV(dst_reg(tmp_src), src));
877    emit(ELK_VEC4_OPCODE_TO_DOUBLE, tmp_dst, tmp_src);
878    emit(MOV(dst, src_reg(tmp_dst)));
879 }
880 
881 /**
882  * Try to use an immediate value for a source
883  *
884  * In cases of flow control, constant propagation is sometimes unable to
885  * determine that a register contains a constant value.  To work around this,
886  * try to emit a literal as one of the sources.  If \c try_src0_also is set,
887  * \c op[0] will also be tried for an immediate value.
888  *
889  * If \c op[0] is modified, the operands will be exchanged so that \c op[1]
890  * will always be the immediate value.
891  *
892  * \return The index of the source that was modified, 0 or 1, if successful.
893  * Otherwise, -1.
894  *
895  * \param op - Operands to the instruction
896  * \param try_src0_also - True if \c op[0] should also be a candidate for
897  *                        getting an immediate value.  This should only be set
898  *                        for commutative operations.
899  */
900 static int
901 try_immediate_source(const nir_alu_instr *instr, src_reg *op,
902                      bool try_src0_also)
903 {
904    unsigned idx;
905 
906    /* MOV should be the only single-source instruction passed to this
907     * function.  Any other unary instruction with a constant source should
908     * have been constant-folded away!
909     */
910    assert(nir_op_infos[instr->op].num_inputs > 1 ||
911           instr->op == nir_op_mov);
912 
913    if (instr->op != nir_op_mov &&
914        nir_src_bit_size(instr->src[1].src) == 32 &&
915        nir_src_is_const(instr->src[1].src)) {
916       idx = 1;
917    } else if (try_src0_also &&
918          nir_src_bit_size(instr->src[0].src) == 32 &&
919          nir_src_is_const(instr->src[0].src)) {
920       idx = 0;
921    } else {
922       return -1;
923    }
924 
925    const enum elk_reg_type old_type = op[idx].type;
926 
927    switch (old_type) {
928    case ELK_REGISTER_TYPE_D:
929    case ELK_REGISTER_TYPE_UD: {
930       int first_comp = -1;
931       int d = 0;
932 
933       for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++) {
934          if (nir_alu_instr_channel_used(instr, idx, i)) {
935             if (first_comp < 0) {
936                first_comp = i;
937                d = nir_src_comp_as_int(instr->src[idx].src,
938                                        instr->src[idx].swizzle[i]);
939             } else if (d != nir_src_comp_as_int(instr->src[idx].src,
940                                                 instr->src[idx].swizzle[i])) {
941                return -1;
942             }
943          }
944       }
945 
946       assert(first_comp >= 0);
947 
948       if (op[idx].abs)
949          d = MAX2(-d, d);
950 
951       if (op[idx].negate)
952          d = -d;
953 
954       op[idx] = retype(src_reg(elk_imm_d(d)), old_type);
955       break;
956    }
957 
958    case ELK_REGISTER_TYPE_F: {
959       int first_comp = -1;
960       float f[NIR_MAX_VEC_COMPONENTS] = { 0.0f };
961       bool is_scalar = true;
962 
963       for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++) {
964          if (nir_alu_instr_channel_used(instr, idx, i)) {
965             f[i] = nir_src_comp_as_float(instr->src[idx].src,
966                                          instr->src[idx].swizzle[i]);
967             if (first_comp < 0) {
968                first_comp = i;
969             } else if (f[first_comp] != f[i]) {
970                is_scalar = false;
971             }
972          }
973       }
974 
975       if (is_scalar) {
976          if (op[idx].abs)
977             f[first_comp] = fabs(f[first_comp]);
978 
979          if (op[idx].negate)
980             f[first_comp] = -f[first_comp];
981 
982          op[idx] = src_reg(elk_imm_f(f[first_comp]));
983          assert(op[idx].type == old_type);
984       } else {
985          uint8_t vf_values[4] = { 0, 0, 0, 0 };
986 
987          for (unsigned i = 0; i < ARRAY_SIZE(vf_values); i++) {
988 
989             if (op[idx].abs)
990                f[i] = fabs(f[i]);
991 
992             if (op[idx].negate)
993                f[i] = -f[i];
994 
995             const int vf = elk_float_to_vf(f[i]);
996             if (vf == -1)
997                return -1;
998 
999             vf_values[i] = vf;
1000          }
1001 
1002          op[idx] = src_reg(elk_imm_vf4(vf_values[0], vf_values[1],
1003                                        vf_values[2], vf_values[3]));
1004       }
1005       break;
1006    }
1007 
1008    default:
1009       unreachable("Non-32bit type.");
1010    }
1011 
1012    /* If the instruction has more than one source, the instruction format only
1013     * allows source 1 to be an immediate value.  If the immediate value was
1014     * source 0, then the sources must be exchanged.
1015     */
1016    if (idx == 0 && instr->op != nir_op_mov) {
1017       src_reg tmp = op[0];
1018       op[0] = op[1];
1019       op[1] = tmp;
1020    }
1021 
1022    return idx;
1023 }
1024 
1025 void
1026 vec4_visitor::fix_float_operands(src_reg op[3], nir_alu_instr *instr)
1027 {
1028    bool fixed[3] = { false, false, false };
1029 
1030    for (unsigned i = 0; i < 2; i++) {
1031       if (!nir_src_is_const(instr->src[i].src))
1032          continue;
1033 
1034       for (unsigned j = i + 1; j < 3; j++) {
1035          if (fixed[j])
1036             continue;
1037 
1038          if (!nir_src_is_const(instr->src[j].src))
1039             continue;
1040 
1041          if (nir_alu_srcs_equal(instr, instr, i, j)) {
1042             if (!fixed[i])
1043                op[i] = fix_3src_operand(op[i]);
1044 
1045             op[j] = op[i];
1046 
1047             fixed[i] = true;
1048             fixed[j] = true;
1049          } else if (nir_alu_srcs_negative_equal(instr, instr, i, j)) {
1050             if (!fixed[i])
1051                op[i] = fix_3src_operand(op[i]);
1052 
1053             op[j] = op[i];
1054             op[j].negate = !op[j].negate;
1055 
1056             fixed[i] = true;
1057             fixed[j] = true;
1058          }
1059       }
1060    }
1061 
1062    for (unsigned i = 0; i < 3; i++) {
1063       if (!fixed[i])
1064          op[i] = fix_3src_operand(op[i]);
1065    }
1066 }
1067 
1068 static bool
1069 const_src_fits_in_16_bits(const nir_src &src, elk_reg_type type)
1070 {
1071    assert(nir_src_is_const(src));
1072    if (elk_reg_type_is_unsigned_integer(type)) {
1073       return nir_src_comp_as_uint(src, 0) <= UINT16_MAX;
1074    } else {
1075       const int64_t c = nir_src_comp_as_int(src, 0);
1076       return c <= INT16_MAX && c >= INT16_MIN;
1077    }
1078 }
1079 
1080 void
1081 vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
1082 {
1083    vec4_instruction *inst;
1084 
1085    nir_alu_type dst_type = (nir_alu_type) (nir_op_infos[instr->op].output_type |
1086                                            instr->def.bit_size);
1087    dst_reg dst = get_nir_def(instr->def, dst_type);
1088    dst.writemask &= nir_component_mask(instr->def.num_components);
1089 
1090    src_reg op[4];
1091    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1092       nir_alu_type src_type = (nir_alu_type)
1093          (nir_op_infos[instr->op].input_types[i] |
1094           nir_src_bit_size(instr->src[i].src));
1095       op[i] = get_nir_src(instr->src[i].src, src_type, 4);
1096       op[i].swizzle = elk_swizzle_for_nir_swizzle(instr->src[i].swizzle);
1097    }
1098 
1099 #ifndef NDEBUG
1100    /* On Gen7 and earlier, no functionality is exposed that should allow 8-bit
1101     * integer types to ever exist.
1102     */
1103    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1104       assert(type_sz(op[i].type) > 1);
1105 #endif
1106 
1107    switch (instr->op) {
1108    case nir_op_mov:
1109       try_immediate_source(instr, &op[0], true);
1110       inst = emit(MOV(dst, op[0]));
1111       break;
1112 
1113    case nir_op_vec2:
1114    case nir_op_vec3:
1115    case nir_op_vec4:
1116       unreachable("not reached: should be handled by lower_vec_to_movs()");
1117 
1118    case nir_op_i2f32:
1119    case nir_op_u2f32:
1120       inst = emit(MOV(dst, op[0]));
1121       break;
1122 
1123    case nir_op_f2f32:
1124    case nir_op_f2i32:
1125    case nir_op_f2u32:
1126       if (nir_src_bit_size(instr->src[0].src) == 64)
1127          emit_conversion_from_double(dst, op[0]);
1128       else
1129          inst = emit(MOV(dst, op[0]));
1130       break;
1131 
1132    case nir_op_f2f64:
1133    case nir_op_i2f64:
1134    case nir_op_u2f64:
1135       emit_conversion_to_double(dst, op[0]);
1136       break;
1137 
1138    case nir_op_fsat:
1139       inst = emit(MOV(dst, op[0]));
1140       inst->saturate = true;
1141       break;
1142 
1143    case nir_op_fneg:
1144    case nir_op_ineg:
1145       op[0].negate = true;
1146       inst = emit(MOV(dst, op[0]));
1147       break;
1148 
1149    case nir_op_fabs:
1150    case nir_op_iabs:
1151       op[0].negate = false;
1152       op[0].abs = true;
1153       inst = emit(MOV(dst, op[0]));
1154       break;
1155 
1156    case nir_op_iadd:
1157       assert(instr->def.bit_size < 64);
1158       FALLTHROUGH;
1159    case nir_op_fadd:
1160       try_immediate_source(instr, op, true);
1161       inst = emit(ADD(dst, op[0], op[1]));
1162       break;
1163 
1164    case nir_op_uadd_sat:
1165       assert(instr->def.bit_size < 64);
1166       inst = emit(ADD(dst, op[0], op[1]));
1167       inst->saturate = true;
1168       break;
1169 
1170    case nir_op_fmul:
1171       try_immediate_source(instr, op, true);
1172       inst = emit(MUL(dst, op[0], op[1]));
1173       break;
1174 
1175    case nir_op_imul: {
1176       assert(instr->def.bit_size < 64);
1177 
1178       /* For integer multiplication, the MUL uses the low 16 bits of one of
1179        * the operands (src0 through SNB, src1 on IVB and later). The MACH
1180        * accumulates the contribution of the upper 16 bits of that
1181        * operand. If we can determine that one of the args is in the low
1182        * 16 bits, though, we can just emit a single MUL.
1183        */
1184       if (nir_src_is_const(instr->src[0].src) &&
1185           nir_alu_instr_src_read_mask(instr, 0) == 1 &&
1186           const_src_fits_in_16_bits(instr->src[0].src, op[0].type)) {
1187          if (devinfo->ver < 7)
1188             emit(MUL(dst, op[0], op[1]));
1189          else
1190             emit(MUL(dst, op[1], op[0]));
1191       } else if (nir_src_is_const(instr->src[1].src) &&
1192                  nir_alu_instr_src_read_mask(instr, 1) == 1 &&
1193                  const_src_fits_in_16_bits(instr->src[1].src, op[1].type)) {
1194          if (devinfo->ver < 7)
1195             emit(MUL(dst, op[1], op[0]));
1196          else
1197             emit(MUL(dst, op[0], op[1]));
1198       } else {
1199          struct elk_reg acc = retype(elk_acc_reg(8), dst.type);
1200 
1201          emit(MUL(acc, op[0], op[1]));
1202          emit(MACH(dst_null_d(), op[0], op[1]));
1203          emit(MOV(dst, src_reg(acc)));
1204       }
1205       break;
1206    }
1207 
1208    case nir_op_imul_high:
1209    case nir_op_umul_high: {
1210       assert(instr->def.bit_size < 64);
1211       struct elk_reg acc = retype(elk_acc_reg(8), dst.type);
1212 
1213       emit(MUL(acc, op[0], op[1]));
1214       emit(MACH(dst, op[0], op[1]));
1215       break;
1216    }
1217 
1218    case nir_op_frcp:
1219       inst = emit_math(ELK_SHADER_OPCODE_RCP, dst, op[0]);
1220       break;
1221 
1222    case nir_op_fexp2:
1223       inst = emit_math(ELK_SHADER_OPCODE_EXP2, dst, op[0]);
1224       break;
1225 
1226    case nir_op_flog2:
1227       inst = emit_math(ELK_SHADER_OPCODE_LOG2, dst, op[0]);
1228       break;
1229 
1230    case nir_op_fsin:
1231       inst = emit_math(ELK_SHADER_OPCODE_SIN, dst, op[0]);
1232       break;
1233 
1234    case nir_op_fcos:
1235       inst = emit_math(ELK_SHADER_OPCODE_COS, dst, op[0]);
1236       break;
1237 
1238    case nir_op_idiv:
1239    case nir_op_udiv:
1240       assert(instr->def.bit_size < 64);
1241       emit_math(ELK_SHADER_OPCODE_INT_QUOTIENT, dst, op[0], op[1]);
1242       break;
1243 
1244    case nir_op_umod:
1245    case nir_op_irem:
1246       /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
1247        * appears that our hardware just does the right thing for signed
1248        * remainder.
1249        */
1250       assert(instr->def.bit_size < 64);
1251       emit_math(ELK_SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]);
1252       break;
1253 
1254    case nir_op_imod: {
1255       /* Get a regular C-style remainder.  If a % b == 0, set the predicate. */
1256       inst = emit_math(ELK_SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]);
1257 
1258       /* Math instructions don't support conditional mod */
1259       inst = emit(MOV(dst_null_d(), src_reg(dst)));
1260       inst->conditional_mod = ELK_CONDITIONAL_NZ;
1261 
1262       /* Now, we need to determine if signs of the sources are different.
1263        * When we XOR the sources, the top bit is 0 if they are the same and 1
1264        * if they are different.  We can then use a conditional modifier to
1265        * turn that into a predicate.  This leads us to an XOR.l instruction.
1266        *
1267        * Technically, according to the PRM, you're not allowed to use .l on a
1268        * XOR instruction.  However, empirical experiments and Curro's reading
1269        * of the simulator source both indicate that it's safe.
1270        */
1271       src_reg tmp = src_reg(this, glsl_ivec4_type());
1272       inst = emit(XOR(dst_reg(tmp), op[0], op[1]));
1273       inst->predicate = ELK_PREDICATE_NORMAL;
1274       inst->conditional_mod = ELK_CONDITIONAL_L;
1275 
1276       /* If the result of the initial remainder operation is non-zero and the
1277        * two sources have different signs, add in a copy of op[1] to get the
1278        * final integer modulus value.
1279        */
1280       inst = emit(ADD(dst, src_reg(dst), op[1]));
1281       inst->predicate = ELK_PREDICATE_NORMAL;
1282       break;
1283    }
1284 
1285    case nir_op_ldexp:
1286       unreachable("not reached: should be handled by ldexp_to_arith()");
1287 
1288    case nir_op_fsqrt:
1289       inst = emit_math(ELK_SHADER_OPCODE_SQRT, dst, op[0]);
1290       break;
1291 
1292    case nir_op_frsq:
1293       inst = emit_math(ELK_SHADER_OPCODE_RSQ, dst, op[0]);
1294       break;
1295 
1296    case nir_op_fpow:
1297       inst = emit_math(ELK_SHADER_OPCODE_POW, dst, op[0], op[1]);
1298       break;
1299 
1300    case nir_op_uadd_carry: {
1301       assert(instr->def.bit_size < 64);
1302       struct elk_reg acc = retype(elk_acc_reg(8), ELK_REGISTER_TYPE_UD);
1303 
1304       emit(ADDC(dst_null_ud(), op[0], op[1]));
1305       emit(MOV(dst, src_reg(acc)));
1306       break;
1307    }
1308 
1309    case nir_op_usub_borrow: {
1310       assert(instr->def.bit_size < 64);
1311       struct elk_reg acc = retype(elk_acc_reg(8), ELK_REGISTER_TYPE_UD);
1312 
1313       emit(SUBB(dst_null_ud(), op[0], op[1]));
1314       emit(MOV(dst, src_reg(acc)));
1315       break;
1316    }
1317 
1318    case nir_op_ftrunc:
1319       inst = emit(RNDZ(dst, op[0]));
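      /* Before Gen6, RNDZ does not apply the final increment itself: it sets
       * the R (round-increment) flag, and the predicated ADD of 1.0 below
       * applies it when needed.
       */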
1320       if (devinfo->ver < 6) {
1321          inst->conditional_mod = ELK_CONDITIONAL_R;
1322          inst = emit(ADD(dst, src_reg(dst), elk_imm_f(1.0f)));
1323          inst->predicate = ELK_PREDICATE_NORMAL;
1324          inst = emit(MOV(dst, src_reg(dst))); /* for potential saturation */
1325       }
1326       break;
1327 
1328    case nir_op_fceil: {
1329       src_reg tmp = src_reg(this, glsl_float_type());
1330       tmp.swizzle = elk_swizzle_for_size(nir_src_num_components(instr->src[0].src));
1331 
1332       op[0].negate = !op[0].negate;
1333       emit(RNDD(dst_reg(tmp), op[0]));
1334       tmp.negate = true;
1335       inst = emit(MOV(dst, tmp));
1336       break;
1337    }
1338 
1339    case nir_op_ffloor:
1340       inst = emit(RNDD(dst, op[0]));
1341       break;
1342 
1343    case nir_op_ffract:
1344       inst = emit(FRC(dst, op[0]));
1345       break;
1346 
1347    case nir_op_fround_even:
1348       inst = emit(RNDE(dst, op[0]));
1349       if (devinfo->ver < 6) {
1350          inst->conditional_mod = ELK_CONDITIONAL_R;
1351          inst = emit(ADD(dst, src_reg(dst), elk_imm_f(1.0f)));
1352          inst->predicate = ELK_PREDICATE_NORMAL;
1353          inst = emit(MOV(dst, src_reg(dst))); /* for potential saturation */
1354       }
1355       break;
1356 
1357    case nir_op_fquantize2f16: {
1358       /* See also vec4_visitor::emit_pack_half_2x16() */
1359       src_reg tmp16 = src_reg(this, glsl_uvec4_type());
1360       src_reg tmp32 = src_reg(this, glsl_vec4_type());
1361       src_reg zero = src_reg(this, glsl_vec4_type());
1362 
1363       /* Check for denormal */
1364       src_reg abs_src0 = op[0];
1365       abs_src0.abs = true;
1366       emit(CMP(dst_null_f(), abs_src0, elk_imm_f(ldexpf(1.0, -14)),
1367                ELK_CONDITIONAL_L));
1368       /* Get the appropriately signed zero */
1369       emit(AND(retype(dst_reg(zero), ELK_REGISTER_TYPE_UD),
1370                retype(op[0], ELK_REGISTER_TYPE_UD),
1371                elk_imm_ud(0x80000000)));
1372       /* Do the actual F32 -> F16 -> F32 conversion */
1373       emit(F32TO16(dst_reg(tmp16), op[0]));
1374       emit(F16TO32(dst_reg(tmp32), tmp16));
1375       /* Select that or zero based on normal status */
1376       inst = emit(ELK_OPCODE_SEL, dst, zero, tmp32);
1377       inst->predicate = ELK_PREDICATE_NORMAL;
1378       break;
1379    }
1380 
1381    case nir_op_imin:
1382    case nir_op_umin:
1383       assert(instr->def.bit_size < 64);
1384       FALLTHROUGH;
1385    case nir_op_fmin:
1386       try_immediate_source(instr, op, true);
1387       inst = emit_minmax(ELK_CONDITIONAL_L, dst, op[0], op[1]);
1388       break;
1389 
1390    case nir_op_imax:
1391    case nir_op_umax:
1392       assert(instr->def.bit_size < 64);
1393       FALLTHROUGH;
1394    case nir_op_fmax:
1395       try_immediate_source(instr, op, true);
1396       inst = emit_minmax(ELK_CONDITIONAL_GE, dst, op[0], op[1]);
1397       break;
1398 
1399    case nir_op_fddx:
1400    case nir_op_fddx_coarse:
1401    case nir_op_fddx_fine:
1402    case nir_op_fddy:
1403    case nir_op_fddy_coarse:
1404    case nir_op_fddy_fine:
1405       unreachable("derivatives are not valid in vertex shaders");
1406 
1407    case nir_op_ilt32:
1408    case nir_op_ult32:
1409    case nir_op_ige32:
1410    case nir_op_uge32:
1411    case nir_op_ieq32:
1412    case nir_op_ine32:
1413       assert(instr->def.bit_size < 64);
1414       FALLTHROUGH;
1415    case nir_op_flt32:
1416    case nir_op_fge32:
1417    case nir_op_feq32:
1418    case nir_op_fneu32: {
1419       enum elk_conditional_mod conditional_mod =
1420          elk_cmod_for_nir_comparison(instr->op);
1421 
1422       if (nir_src_bit_size(instr->src[0].src) < 64) {
1423          /* If the order of the sources is changed due to an immediate value,
1424           * then the condition must also be changed.
1425           */
1426          if (try_immediate_source(instr, op, true) == 0)
1427             conditional_mod = elk_swap_cmod(conditional_mod);
1428 
1429          emit(CMP(dst, op[0], op[1], conditional_mod));
1430       } else {
1431          /* Produce a 32-bit boolean result from the DF comparison by selecting
1432           * only the low 32-bit in each DF produced. Do this in a temporary
1433           * so we can then move from there to the result using align16 again
1434           * to honor the original writemask.
1435           */
1436          dst_reg temp = dst_reg(this, glsl_dvec4_type());
1437          emit(CMP(temp, op[0], op[1], conditional_mod));
1438          dst_reg result = dst_reg(this, glsl_bvec4_type());
1439          emit(ELK_VEC4_OPCODE_PICK_LOW_32BIT, result, src_reg(temp));
1440          emit(MOV(dst, src_reg(result)));
1441       }
1442       break;
1443    }
1444 
1445    case nir_op_b32all_iequal2:
1446    case nir_op_b32all_iequal3:
1447    case nir_op_b32all_iequal4:
1448       assert(instr->def.bit_size < 64);
1449       FALLTHROUGH;
1450    case nir_op_b32all_fequal2:
1451    case nir_op_b32all_fequal3:
1452    case nir_op_b32all_fequal4: {
1453       unsigned swiz =
1454          elk_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]);
1455 
1456       emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz),
1457                elk_cmod_for_nir_comparison(instr->op)));
1458       emit(MOV(dst, elk_imm_d(0)));
1459       inst = emit(MOV(dst, elk_imm_d(~0)));
1460       inst->predicate = ELK_PREDICATE_ALIGN16_ALL4H;
1461       break;
1462    }
1463 
1464    case nir_op_b32any_inequal2:
1465    case nir_op_b32any_inequal3:
1466    case nir_op_b32any_inequal4:
1467       assert(instr->def.bit_size < 64);
1468       FALLTHROUGH;
1469    case nir_op_b32any_fnequal2:
1470    case nir_op_b32any_fnequal3:
1471    case nir_op_b32any_fnequal4: {
1472       unsigned swiz =
1473          elk_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]);
1474 
1475       emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz),
1476                elk_cmod_for_nir_comparison(instr->op)));
1477 
1478       emit(MOV(dst, elk_imm_d(0)));
1479       inst = emit(MOV(dst, elk_imm_d(~0)));
1480       inst->predicate = ELK_PREDICATE_ALIGN16_ANY4H;
1481       break;
1482    }
1483 
1484    case nir_op_inot:
1485       assert(instr->def.bit_size < 64);
1486       emit(NOT(dst, op[0]));
1487       break;
1488 
1489    case nir_op_ixor:
1490       assert(instr->def.bit_size < 64);
1491       try_immediate_source(instr, op, true);
1492       emit(XOR(dst, op[0], op[1]));
1493       break;
1494 
1495    case nir_op_ior:
1496       assert(instr->def.bit_size < 64);
1497       try_immediate_source(instr, op, true);
1498       emit(OR(dst, op[0], op[1]));
1499       break;
1500 
1501    case nir_op_iand:
1502       assert(instr->def.bit_size < 64);
1503       try_immediate_source(instr, op, true);
1504       emit(AND(dst, op[0], op[1]));
1505       break;
1506 
1507    case nir_op_b2i32:
1508    case nir_op_b2f32:
1509    case nir_op_b2f64:
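      /* NIR booleans reaching this point are expected to be 0/~0 (0 / -1 as a
       * signed integer), so negating the source yields 0/1 and the MOV (or
       * the conversion to double below) produces the expected integer or
       * floating-point result.
       */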
1510       if (instr->def.bit_size > 32) {
1511          assert(dst.type == ELK_REGISTER_TYPE_DF);
1512          emit_conversion_to_double(dst, negate(op[0]));
1513       } else {
1514          emit(MOV(dst, negate(op[0])));
1515       }
1516       break;
1517 
1518    case nir_op_unpack_half_2x16_split_x:
1519    case nir_op_unpack_half_2x16_split_y:
1520    case nir_op_pack_half_2x16_split:
1521       unreachable("not reached: should not occur in vertex shader");
1522 
1523    case nir_op_unpack_snorm_2x16:
1524    case nir_op_unpack_unorm_2x16:
1525    case nir_op_pack_snorm_2x16:
1526    case nir_op_pack_unorm_2x16:
1527       unreachable("not reached: should be handled by lower_packing_builtins");
1528 
1529    case nir_op_pack_uvec4_to_uint:
1530       unreachable("not reached");
1531 
1532    case nir_op_pack_uvec2_to_uint: {
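      /* Packs the .x and .y components into a single 32-bit value:
       * dst = (src.y << 16) | (src.x & 0xffff).  For example,
       * src = (0x1234, 0xabcd) produces 0xabcd1234.
       */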
1533       dst_reg tmp1 = dst_reg(this, glsl_uint_type());
1534       tmp1.writemask = WRITEMASK_X;
1535       op[0].swizzle = ELK_SWIZZLE_YYYY;
1536       emit(SHL(tmp1, op[0], src_reg(elk_imm_ud(16u))));
1537 
1538       dst_reg tmp2 = dst_reg(this, glsl_uint_type());
1539       tmp2.writemask = WRITEMASK_X;
1540       op[0].swizzle = ELK_SWIZZLE_XXXX;
1541       emit(AND(tmp2, op[0], src_reg(elk_imm_ud(0xffffu))));
1542 
1543       emit(OR(dst, src_reg(tmp1), src_reg(tmp2)));
1544       break;
1545    }
1546 
1547    case nir_op_pack_64_2x32_split: {
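      /* op[0] provides the low 32 bits and op[1] the high 32 bits of each
       * 64-bit channel, e.g. (0x89abcdef, 0x01234567) packs to
       * 0x0123456789abcdef.
       */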
1548       dst_reg result = dst_reg(this, glsl_dvec4_type());
1549       dst_reg tmp = dst_reg(this, glsl_uvec4_type());
1550       emit(MOV(tmp, retype(op[0], ELK_REGISTER_TYPE_UD)));
1551       emit(ELK_VEC4_OPCODE_SET_LOW_32BIT, result, src_reg(tmp));
1552       emit(MOV(tmp, retype(op[1], ELK_REGISTER_TYPE_UD)));
1553       emit(ELK_VEC4_OPCODE_SET_HIGH_32BIT, result, src_reg(tmp));
1554       emit(MOV(dst, src_reg(result)));
1555       break;
1556    }
1557 
1558    case nir_op_unpack_64_2x32_split_x:
1559    case nir_op_unpack_64_2x32_split_y: {
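      /* split_x extracts the low dword and split_y the high dword of each
       * 64-bit channel, e.g. 0x0123456789abcdef unpacks to x = 0x89abcdef
       * and y = 0x01234567.
       */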
1560       enum elk_opcode oper = (instr->op == nir_op_unpack_64_2x32_split_x) ?
1561          ELK_VEC4_OPCODE_PICK_LOW_32BIT : ELK_VEC4_OPCODE_PICK_HIGH_32BIT;
1562       dst_reg tmp = dst_reg(this, glsl_dvec4_type());
1563       emit(MOV(tmp, op[0]));
1564       dst_reg tmp2 = dst_reg(this, glsl_uvec4_type());
1565       emit(oper, tmp2, src_reg(tmp));
1566       emit(MOV(dst, src_reg(tmp2)));
1567       break;
1568    }
1569 
1570    case nir_op_unpack_half_2x16:
1571       /* NIR does not guarantee a correct swizzle outside the boundaries of a
1572        * vector, and emit_unpack_half_2x16 uses the source operand in an
1573        * operation with WRITEMASK_Y while our source operand only has size 1,
1574        * so it would read incorrect data and caused regressions in Piglit.
1575        * Replicate the swizzle of the first component across the remaining
1576        * components to avoid this. In the vec4_visitor IR code path this is
1577        * not needed because the operand already has the correct swizzle.
1578        */
1579       op[0].swizzle = elk_compose_swizzle(ELK_SWIZZLE_XXXX, op[0].swizzle);
1580       emit_unpack_half_2x16(dst, op[0]);
1581       break;
1582 
1583    case nir_op_pack_half_2x16:
1584       emit_pack_half_2x16(dst, op[0]);
1585       break;
1586 
1587    case nir_op_unpack_unorm_4x8:
1588       assert(instr->def.bit_size < 64);
1589       emit_unpack_unorm_4x8(dst, op[0]);
1590       break;
1591 
1592    case nir_op_pack_unorm_4x8:
1593       assert(instr->def.bit_size < 64);
1594       emit_pack_unorm_4x8(dst, op[0]);
1595       break;
1596 
1597    case nir_op_unpack_snorm_4x8:
1598       assert(instr->def.bit_size < 64);
1599       emit_unpack_snorm_4x8(dst, op[0]);
1600       break;
1601 
1602    case nir_op_pack_snorm_4x8:
1603       assert(instr->def.bit_size < 64);
1604       emit_pack_snorm_4x8(dst, op[0]);
1605       break;
1606 
1607    case nir_op_bitfield_reverse:
1608       assert(instr->def.bit_size == 32);
1609       assert(nir_src_bit_size(instr->src[0].src) == 32);
1610       emit(BFREV(dst, op[0]));
1611       break;
1612 
1613    case nir_op_bit_count:
1614       assert(instr->def.bit_size == 32);
1615       assert(nir_src_bit_size(instr->src[0].src) < 64);
1616       emit(CBIT(dst, op[0]));
1617       break;
1618 
1619    case nir_op_ifind_msb: {
1620       assert(instr->def.bit_size == 32);
1621       assert(nir_src_bit_size(instr->src[0].src) == 32);
1622       assert(devinfo->ver >= 7);
1623 
1624       vec4_builder bld = vec4_builder(this).at_end();
1625       src_reg src(dst);
1626 
1627       emit(FBH(retype(dst, ELK_REGISTER_TYPE_UD), op[0]));
1628 
1629       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1630        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1631        * subtract the result from 31 to convert the MSB count into an LSB
1632        * count.
1633        */
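      /* For example, ifind_msb(0x00000100) should return 8: FBH yields 23
       * (counting down from bit 31), and 31 - 23 = 8.
       */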
1634       bld.CMP(dst_null_d(), src, elk_imm_d(-1), ELK_CONDITIONAL_NZ);
1635 
1636       inst = bld.ADD(dst, src, elk_imm_d(31));
1637       inst->predicate = ELK_PREDICATE_NORMAL;
1638       inst->src[0].negate = true;
1639       break;
1640    }
1641 
1642    case nir_op_uclz:
1643       assert(instr->def.bit_size == 32);
1644       assert(nir_src_bit_size(instr->src[0].src) == 32);
1645       emit(LZD(dst, op[0]));
1646       break;
1647 
1648    case nir_op_find_lsb:
1649       assert(instr->def.bit_size == 32);
1650       assert(nir_src_bit_size(instr->src[0].src) == 32);
1651       assert(devinfo->ver >= 7);
1652       emit(FBL(dst, op[0]));
1653       break;
1654 
1655    case nir_op_ubitfield_extract:
1656    case nir_op_ibitfield_extract:
1657       unreachable("should have been lowered");
1658    case nir_op_ubfe:
1659    case nir_op_ibfe:
1660       assert(instr->def.bit_size < 64);
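      /* NIR orders the bfe sources as (value, offset, bits), while the
       * hardware BFE instruction takes them in the opposite order, hence
       * op[2], op[1], op[0] below.
       */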
1661       op[0] = fix_3src_operand(op[0]);
1662       op[1] = fix_3src_operand(op[1]);
1663       op[2] = fix_3src_operand(op[2]);
1664 
1665       emit(BFE(dst, op[2], op[1], op[0]));
1666       break;
1667 
1668    case nir_op_bfm:
1669       assert(instr->def.bit_size < 64);
1670       emit(BFI1(dst, op[0], op[1]));
1671       break;
1672 
1673    case nir_op_bfi:
1674       assert(instr->def.bit_size < 64);
1675       op[0] = fix_3src_operand(op[0]);
1676       op[1] = fix_3src_operand(op[1]);
1677       op[2] = fix_3src_operand(op[2]);
1678 
1679       emit(BFI2(dst, op[0], op[1], op[2]));
1680       break;
1681 
1682    case nir_op_bitfield_insert:
1683       unreachable("not reached: should have been lowered");
1684 
1685    case nir_op_fsign:
1686        if (type_sz(op[0].type) < 8) {
1687          /* AND(val, 0x80000000) gives the sign bit.
1688           *
1689           * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1690           * zero.
1691           */
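         /* For example, op[0] = -2.5f (0xc0200000) keeps sign bit 0x80000000
          * and ORs in 0x3f800000 to give 0xbf800000 = -1.0f; for op[0] = 0.0f
          * the OR is skipped and the result stays 0.
          */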
1692          emit(CMP(dst_null_f(), op[0], elk_imm_f(0.0f), ELK_CONDITIONAL_NZ));
1693 
1694          op[0].type = ELK_REGISTER_TYPE_UD;
1695          dst.type = ELK_REGISTER_TYPE_UD;
1696          emit(AND(dst, op[0], elk_imm_ud(0x80000000u)));
1697 
1698          inst = emit(OR(dst, src_reg(dst), elk_imm_ud(0x3f800000u)));
1699          inst->predicate = ELK_PREDICATE_NORMAL;
1700          dst.type = ELK_REGISTER_TYPE_F;
1701       } else {
1702          /* For doubles we do the same but we need to consider:
1703           *
1704           * - We use a MOV with conditional_mod instead of a CMP so that we can
1705           *   skip loading a 0.0 immediate. We use a source modifier on the
1706           *   source of the MOV so that we flush denormalized values to 0.
1707           *   Since we want to compare against 0, this won't alter the result.
1708           * - We need to extract the high 32-bit of each DF where the sign
1709           *   is stored.
1710           * - We need to produce a DF result.
1711           */
1712 
1713          /* Check for zero */
1714          src_reg value = op[0];
1715          value.abs = true;
1716          inst = emit(MOV(dst_null_df(), value));
1717          inst->conditional_mod = ELK_CONDITIONAL_NZ;
1718 
1719          /* AND each high 32-bit channel with 0x80000000u */
1720          dst_reg tmp = dst_reg(this, glsl_uvec4_type());
1721          emit(ELK_VEC4_OPCODE_PICK_HIGH_32BIT, tmp, op[0]);
1722          emit(AND(tmp, src_reg(tmp), elk_imm_ud(0x80000000u)));
1723 
1724          /* OR in the bit pattern of 1.0 (0x3f800000), predicated to skip the
1725           * channels whose value was 0, leaving +/-1.0 in the rest
1726           */
1727          inst = emit(OR(tmp, src_reg(tmp), elk_imm_ud(0x3f800000u)));
1728          inst->predicate = ELK_PREDICATE_NORMAL;
1729 
1730          /* Now convert the result from float to double */
1731          emit_conversion_to_double(dst, retype(src_reg(tmp),
1732                                                ELK_REGISTER_TYPE_F));
1733       }
1734       break;
1735 
1736    case nir_op_ishl:
1737       assert(instr->def.bit_size < 64);
1738       try_immediate_source(instr, op, false);
1739       emit(SHL(dst, op[0], op[1]));
1740       break;
1741 
1742    case nir_op_ishr:
1743       assert(instr->def.bit_size < 64);
1744       try_immediate_source(instr, op, false);
1745       emit(ASR(dst, op[0], op[1]));
1746       break;
1747 
1748    case nir_op_ushr:
1749       assert(instr->def.bit_size < 64);
1750       try_immediate_source(instr, op, false);
1751       emit(SHR(dst, op[0], op[1]));
1752       break;
1753 
1754    case nir_op_ffma:
1755       if (type_sz(dst.type) == 8) {
1756          dst_reg mul_dst = dst_reg(this, glsl_dvec4_type());
1757          emit(MUL(mul_dst, op[1], op[0]));
1758          inst = emit(ADD(dst, src_reg(mul_dst), op[2]));
1759       } else {
1760          fix_float_operands(op, instr);
1761          inst = emit(MAD(dst, op[2], op[1], op[0]));
1762       }
1763       break;
1764 
1765    case nir_op_flrp:
1766       fix_float_operands(op, instr);
1767       inst = emit(LRP(dst, op[2], op[1], op[0]));
1768       break;
1769 
1770    case nir_op_b32csel:
1771       enum elk_predicate predicate;
1772       if (!optimize_predicate(instr, &predicate)) {
1773          emit(CMP(dst_null_d(), op[0], elk_imm_d(0), ELK_CONDITIONAL_NZ));
1774          switch (dst.writemask) {
1775          case WRITEMASK_X:
1776             predicate = ELK_PREDICATE_ALIGN16_REPLICATE_X;
1777             break;
1778          case WRITEMASK_Y:
1779             predicate = ELK_PREDICATE_ALIGN16_REPLICATE_Y;
1780             break;
1781          case WRITEMASK_Z:
1782             predicate = ELK_PREDICATE_ALIGN16_REPLICATE_Z;
1783             break;
1784          case WRITEMASK_W:
1785             predicate = ELK_PREDICATE_ALIGN16_REPLICATE_W;
1786             break;
1787          default:
1788             predicate = ELK_PREDICATE_NORMAL;
1789             break;
1790          }
1791       }
1792       inst = emit(ELK_OPCODE_SEL, dst, op[1], op[2]);
1793       inst->predicate = predicate;
1794       break;
1795 
1796    case nir_op_fdot2_replicated:
1797       try_immediate_source(instr, op, true);
1798       inst = emit(ELK_OPCODE_DP2, dst, op[0], op[1]);
1799       break;
1800 
1801    case nir_op_fdot3_replicated:
1802       try_immediate_source(instr, op, true);
1803       inst = emit(ELK_OPCODE_DP3, dst, op[0], op[1]);
1804       break;
1805 
1806    case nir_op_fdot4_replicated:
1807       try_immediate_source(instr, op, true);
1808       inst = emit(ELK_OPCODE_DP4, dst, op[0], op[1]);
1809       break;
1810 
1811    case nir_op_fdph_replicated:
1812       try_immediate_source(instr, op, false);
1813       inst = emit(ELK_OPCODE_DPH, dst, op[0], op[1]);
1814       break;
1815 
1816    case nir_op_fdiv:
1817       unreachable("not reached: should be lowered by lower_fdiv in the compiler");
1818 
1819    case nir_op_fmod:
1820       unreachable("not reached: should be lowered by lower_fmod in the compiler");
1821 
1822    case nir_op_fsub:
1823    case nir_op_isub:
1824       unreachable("not reached: should be handled by ir_sub_to_add_neg");
1825 
1826    default:
1827       unreachable("Unimplemented ALU operation");
1828    }
1829 
1830    /* If we need to do a boolean resolve, replace the result with -(x & 1)
1831     * to sign extend the low bit to 0/~0
1832     */
1833    if (devinfo->ver <= 5 &&
1834        (instr->instr.pass_flags & ELK_NIR_BOOLEAN_MASK) ==
1835        ELK_NIR_BOOLEAN_NEEDS_RESOLVE) {
1836       dst_reg masked = dst_reg(this, glsl_int_type());
1837       masked.writemask = dst.writemask;
1838       emit(AND(masked, src_reg(dst), elk_imm_d(1)));
1839       src_reg masked_neg = src_reg(masked);
1840       masked_neg.negate = true;
1841       emit(MOV(retype(dst, ELK_REGISTER_TYPE_D), masked_neg));
1842    }
1843 }
1844 
1845 void
1846 vec4_visitor::nir_emit_jump(nir_jump_instr *instr)
1847 {
1848    switch (instr->type) {
1849    case nir_jump_break:
1850       emit(ELK_OPCODE_BREAK);
1851       break;
1852 
1853    case nir_jump_continue:
1854       emit(ELK_OPCODE_CONTINUE);
1855       break;
1856 
1857    case nir_jump_return:
1858       FALLTHROUGH;
1859    default:
1860       unreachable("unknown jump");
1861    }
1862 }
1863 
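/* Sampler indices that do not fit in the 4-bit field of the message
 * descriptor have to be supplied through the message header.  Haswell is the
 * only platform handled here where that can happen, so a header is needed
 * there for indices >= 16 or for an indirect sampler index that is not known
 * at compile time.
 */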
1864 static bool
1865 is_high_sampler(const struct intel_device_info *devinfo, src_reg sampler)
1866 {
1867    if (devinfo->verx10 != 75)
1868       return false;
1869 
1870    return sampler.file != IMM || sampler.ud >= 16;
1871 }
1872 
1873 void
1874 vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
1875 {
1876    unsigned texture = instr->texture_index;
1877    unsigned sampler = instr->sampler_index;
1878    src_reg texture_reg = elk_imm_ud(texture);
1879    src_reg sampler_reg = elk_imm_ud(sampler);
1880    src_reg coordinate;
1881    const glsl_type *coord_type = NULL;
1882    src_reg shadow_comparator;
1883    src_reg offset_value;
1884    src_reg lod, lod2;
1885    src_reg sample_index;
1886    src_reg mcs;
1887 
1888    dst_reg dest = get_nir_def(instr->def, instr->dest_type);
1889 
1890    /* The hardware requires a LOD for buffer textures */
1891    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
1892       lod = elk_imm_d(0);
1893 
1894    /* Load the texture operation sources */
1895    uint32_t constant_offset = 0;
1896    for (unsigned i = 0; i < instr->num_srcs; i++) {
1897       switch (instr->src[i].src_type) {
1898       case nir_tex_src_comparator:
1899          shadow_comparator = get_nir_src(instr->src[i].src,
1900                                          ELK_REGISTER_TYPE_F, 1);
1901          break;
1902 
1903       case nir_tex_src_coord: {
1904          unsigned src_size = nir_tex_instr_src_size(instr, i);
1905 
1906          switch (instr->op) {
1907          case nir_texop_txf:
1908          case nir_texop_txf_ms:
1909          case nir_texop_samples_identical:
1910             coordinate = get_nir_src(instr->src[i].src, ELK_REGISTER_TYPE_D,
1911                                      src_size);
1912             coord_type = glsl_ivec_type(src_size);
1913             break;
1914 
1915          default:
1916             coordinate = get_nir_src(instr->src[i].src, ELK_REGISTER_TYPE_F,
1917                                      src_size);
1918             coord_type = glsl_vec_type(src_size);
1919             break;
1920          }
1921          break;
1922       }
1923 
1924       case nir_tex_src_ddx:
1925          lod = get_nir_src(instr->src[i].src, ELK_REGISTER_TYPE_F,
1926                            nir_tex_instr_src_size(instr, i));
1927          break;
1928 
1929       case nir_tex_src_ddy:
1930          lod2 = get_nir_src(instr->src[i].src, ELK_REGISTER_TYPE_F,
1931                            nir_tex_instr_src_size(instr, i));
1932          break;
1933 
1934       case nir_tex_src_lod:
1935          switch (instr->op) {
1936          case nir_texop_txs:
1937          case nir_texop_txf:
1938             lod = get_nir_src(instr->src[i].src, ELK_REGISTER_TYPE_D, 1);
1939             break;
1940 
1941          default:
1942             lod = get_nir_src(instr->src[i].src, ELK_REGISTER_TYPE_F, 1);
1943             break;
1944          }
1945          break;
1946 
1947       case nir_tex_src_ms_index: {
1948          sample_index = get_nir_src(instr->src[i].src, ELK_REGISTER_TYPE_D, 1);
1949          break;
1950       }
1951 
1952       case nir_tex_src_offset:
1953          if (!elk_texture_offset(instr, i, &constant_offset)) {
1954             offset_value =
1955                get_nir_src(instr->src[i].src, ELK_REGISTER_TYPE_D, 2);
1956          }
1957          break;
1958 
1959       case nir_tex_src_texture_offset: {
1960          assert(texture_reg.is_zero());
1961          texture_reg = emit_uniformize(get_nir_src(instr->src[i].src,
1962                                                    ELK_REGISTER_TYPE_UD, 1));
1963          break;
1964       }
1965 
1966       case nir_tex_src_sampler_offset: {
1967          assert(sampler_reg.is_zero());
1968          sampler_reg = emit_uniformize(get_nir_src(instr->src[i].src,
1969                                                    ELK_REGISTER_TYPE_UD, 1));
1970          break;
1971       }
1972 
1973       case nir_tex_src_projector:
1974          unreachable("Should be lowered by nir_lower_tex");
1975 
1976       case nir_tex_src_bias:
1977          unreachable("LOD bias is not valid for vertex shaders.\n");
1978 
1979       default:
1980          unreachable("unknown texture source");
1981       }
1982    }
1983 
1984    if (instr->op == nir_texop_txf_ms ||
1985        instr->op == nir_texop_samples_identical) {
1986       assert(coord_type != NULL);
1987       if (devinfo->ver >= 7) {
1988          mcs = emit_mcs_fetch(coord_type, coordinate, texture_reg);
1989       } else {
1990          mcs = elk_imm_ud(0u);
1991       }
1992    }
1993 
1994    /* Stuff the channel select bits in the top of the texture offset */
1995    if (instr->op == nir_texop_tg4) {
1996       if (instr->component == 1 &&
1997           (key_tex->gather_channel_quirk_mask & (1 << texture))) {
1998          /* gather4 sampler is broken for green channel on RG32F --
1999           * we must ask for blue instead.
2000           */
2001          constant_offset |= 2 << 16;
2002       } else {
2003          constant_offset |= instr->component << 16;
2004       }
2005    }
2006 
2007    enum elk_opcode opcode;
2008    switch (instr->op) {
2009    case nir_texop_tex:             opcode = ELK_SHADER_OPCODE_TXL;        break;
2010    case nir_texop_txl:             opcode = ELK_SHADER_OPCODE_TXL;        break;
2011    case nir_texop_txd:             opcode = ELK_SHADER_OPCODE_TXD;        break;
2012    case nir_texop_txf:             opcode = ELK_SHADER_OPCODE_TXF;        break;
2013    case nir_texop_txf_ms:          opcode = ELK_SHADER_OPCODE_TXF_CMS;    break;
2014    case nir_texop_txs:             opcode = ELK_SHADER_OPCODE_TXS;        break;
2015    case nir_texop_query_levels:    opcode = ELK_SHADER_OPCODE_TXS;        break;
2016    case nir_texop_texture_samples: opcode = ELK_SHADER_OPCODE_SAMPLEINFO; break;
2017    case nir_texop_tg4:
2018       opcode = offset_value.file != BAD_FILE ? ELK_SHADER_OPCODE_TG4_OFFSET
2019                                              : ELK_SHADER_OPCODE_TG4;
2020       break;
2021    case nir_texop_samples_identical: {
2022       /* There are some challenges implementing this for vec4, and it seems
2023        * unlikely to be used anyway.  For now, just always return false.
2024        */
2025       emit(MOV(dest, elk_imm_ud(0u)));
2026       return;
2027    }
2028    case nir_texop_txb:
2029    case nir_texop_lod:
2030       unreachable("Implicit LOD is only valid inside fragment shaders.");
2031    default:
2032       unreachable("Unrecognized tex op");
2033    }
2034 
2035    vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
2036 
2037    inst->offset = constant_offset;
2038 
2039    /* The message header is necessary for:
2040     * - Gfx4 (always)
2041     * - Texel offsets
2042     * - Gather channel selection
2043     * - Sampler indices too large to fit in a 4-bit value.
2044     * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
2045     */
2046    inst->header_size =
2047       (devinfo->ver < 5 ||
2048        inst->offset != 0 ||
2049        opcode == ELK_SHADER_OPCODE_TG4 ||
2050        opcode == ELK_SHADER_OPCODE_TG4_OFFSET ||
2051        opcode == ELK_SHADER_OPCODE_SAMPLEINFO ||
2052        is_high_sampler(devinfo, sampler_reg)) ? 1 : 0;
2053    inst->base_mrf = 2;
2054    inst->mlen = inst->header_size;
2055    inst->dst.writemask = WRITEMASK_XYZW;
2056    inst->shadow_compare = shadow_comparator.file != BAD_FILE;
2057 
2058    inst->src[1] = texture_reg;
2059    inst->src[2] = sampler_reg;
2060 
2061    /* MRF for the first parameter */
2062    int param_base = inst->base_mrf + inst->header_size;
2063 
2064    if (opcode == ELK_SHADER_OPCODE_TXS) {
2065       int writemask = devinfo->ver == 4 ? WRITEMASK_W : WRITEMASK_X;
2066       emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
2067       inst->mlen++;
2068    } else if (opcode == ELK_SHADER_OPCODE_SAMPLEINFO) {
2069       inst->dst.writemask = WRITEMASK_X;
2070    } else {
2071       /* Load the coordinate */
2072       /* FINISHME: gl_clamp_mask and saturate */
2073       int coord_mask = (1 << instr->coord_components) - 1;
2074       int zero_mask = 0xf & ~coord_mask;
2075 
2076       emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
2077                coordinate));
2078       inst->mlen++;
2079 
2080       if (zero_mask != 0) {
2081          emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
2082                   elk_imm_d(0)));
2083       }
2084       /* Load the shadow comparator */
2085       if (shadow_comparator.file != BAD_FILE &&
2086           opcode != ELK_SHADER_OPCODE_TXD &&
2087           opcode != ELK_SHADER_OPCODE_TG4_OFFSET) {
2088 	 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
2089 			  WRITEMASK_X),
2090 		  shadow_comparator));
2091 	 inst->mlen++;
2092       }
2093 
2094       /* Load the LOD info */
2095       switch (opcode) {
2096       case ELK_SHADER_OPCODE_TXL: {
2097 	 int mrf, writemask;
2098 	 if (devinfo->ver >= 5) {
2099 	    mrf = param_base + 1;
2100 	    if (shadow_comparator.file != BAD_FILE) {
2101 	       writemask = WRITEMASK_Y;
2102 	       /* mlen already incremented */
2103 	    } else {
2104 	       writemask = WRITEMASK_X;
2105 	       inst->mlen++;
2106 	    }
2107 	 } else /* devinfo->ver == 4 */ {
2108 	    mrf = param_base;
2109 	    writemask = WRITEMASK_W;
2110 	 }
2111 	 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
2112          break;
2113       }
2114 
2115       case ELK_SHADER_OPCODE_TXF:
2116          emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
2117          break;
2118 
2119       case ELK_SHADER_OPCODE_TXF_CMS:
2120          emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
2121                   sample_index));
2122          if (devinfo->ver >= 7) {
2123             /* MCS data is in the first channel of `mcs`, but we need to get it into
2124              * the .y channel of the second vec4 of params, so replicate .x across
2125              * the whole vec4 and then mask off everything except .y
2126              */
2127             mcs.swizzle = ELK_SWIZZLE_XXXX;
2128             emit(MOV(dst_reg(MRF, param_base + 1, glsl_uint_type(), WRITEMASK_Y),
2129                      mcs));
2130          }
2131          inst->mlen++;
2132          break;
2133 
2134       case ELK_SHADER_OPCODE_TXD: {
2135          const elk_reg_type type = lod.type;
2136 
2137 	 if (devinfo->ver >= 5) {
2138 	    lod.swizzle = ELK_SWIZZLE4(ELK_SWIZZLE_X,ELK_SWIZZLE_X,ELK_SWIZZLE_Y,ELK_SWIZZLE_Y);
2139 	    lod2.swizzle = ELK_SWIZZLE4(ELK_SWIZZLE_X,ELK_SWIZZLE_X,ELK_SWIZZLE_Y,ELK_SWIZZLE_Y);
2140 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
2141 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
2142 	    inst->mlen++;
2143 
2144 	    if (nir_tex_instr_dest_size(instr) == 3 ||
2145                 shadow_comparator.file != BAD_FILE) {
2146 	       lod.swizzle = ELK_SWIZZLE_ZZZZ;
2147 	       lod2.swizzle = ELK_SWIZZLE_ZZZZ;
2148 	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
2149 	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
2150 	       inst->mlen++;
2151 
2152                if (shadow_comparator.file != BAD_FILE) {
2153                   emit(MOV(dst_reg(MRF, param_base + 2,
2154                                    shadow_comparator.type, WRITEMASK_Z),
2155                            shadow_comparator));
2156                }
2157 	    }
2158 	 } else /* devinfo->ver == 4 */ {
2159 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
2160 	    emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
2161 	    inst->mlen += 2;
2162 	 }
2163          break;
2164       }
2165 
2166       case ELK_SHADER_OPCODE_TG4_OFFSET:
2167          if (shadow_comparator.file != BAD_FILE) {
2168             emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
2169                      shadow_comparator));
2170          }
2171 
2172          emit(MOV(dst_reg(MRF, param_base + 1, glsl_ivec2_type(), WRITEMASK_XY),
2173                   offset_value));
2174          inst->mlen++;
2175          break;
2176 
2177       default:
2178          break;
2179       }
2180    }
2181 
2182    emit(inst);
2183 
2184    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2185     * spec requires layers.
2186     */
2187    if (instr->op == nir_texop_txs && devinfo->ver < 7) {
2188       /* Gfx4-6 return 0 instead of 1 for single layer surfaces. */
2189       emit_minmax(ELK_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
2190                   src_reg(inst->dst), elk_imm_d(1));
2191    }
2192 
2193    if (instr->op == nir_texop_query_levels) {
2194       /* # levels is in .w */
2195       src_reg swizzled(dest);
2196       swizzled.swizzle = ELK_SWIZZLE4(ELK_SWIZZLE_W, ELK_SWIZZLE_W,
2197                                       ELK_SWIZZLE_W, ELK_SWIZZLE_W);
2198       emit(MOV(dest, swizzled));
2199    }
2200 }
2201 
2202 src_reg
2203 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
2204                              src_reg coordinate, src_reg surface)
2205 {
2206    vec4_instruction *inst =
2207       new(mem_ctx) vec4_instruction(ELK_SHADER_OPCODE_TXF_MCS,
2208                                     dst_reg(this, glsl_uvec4_type()));
2209    inst->base_mrf = 2;
2210    inst->src[1] = surface;
2211    inst->src[2] = elk_imm_ud(0); /* sampler */
2212    inst->mlen = 1;
2213 
2214    const int param_base = inst->base_mrf;
2215 
2216    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2217    int coord_mask = (1 << coordinate_type->vector_elements) - 1;
2218    int zero_mask = 0xf & ~coord_mask;
2219 
2220    emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
2221             coordinate));
2222 
2223    emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
2224             elk_imm_d(0)));
2225 
2226    emit(inst);
2227    return src_reg(inst->dst);
2228 }
2229 
2230 void
2231 vec4_visitor::nir_emit_undef(nir_undef_instr *instr)
2232 {
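   /* Just reserve enough VGRF space for the value (one register per 32 bits
    * of bit size); the contents are undefined and never initialized.
    */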
2233    nir_ssa_values[instr->def.index] =
2234       dst_reg(VGRF, alloc.allocate(DIV_ROUND_UP(instr->def.bit_size, 32)));
2235 }
2236 
2237 /* SIMD4x2 64bit data is stored in register space like this:
2238  *
2239  * r0.0:DF  x0 y0 z0 w0
2240  * r1.0:DF  x1 y1 z1 w1
2241  *
2242  * When we need to write data such as this to memory using 32-bit write
2243  * messages we need to shuffle it in this fashion:
2244  *
2245  * r0.0:DF  x0 y0 x1 y1 (to be written at base offset)
2246  * r1.0:DF  z0 w0 z1 w1 (to be written at base offset + 16)
2247  *
2248  * We need to do the inverse operation when we read using 32-bit messages,
2249  * which we can do by applying the exact same shuffling to the 64-bit data
2250  * we read, except that, because the data for each vertex is positioned
2251  * differently, we need to apply different channel enables.
2252  *
2253  * This function takes 64bit data and shuffles it as explained above.
2254  *
2255  * The @for_write parameter is used to specify if the shuffling is being done
2256  * for proper SIMD4x2 64-bit data that needs to be shuffled prior to a 32-bit
2257  * write message (for_write = true), or instead we are doing the inverse
2258  * operation and we have just read 64-bit data using 32-bit messages that we
2259  * need to shuffle to create valid SIMD4x2 64-bit data (for_write = false).
2260  *
2261  * If @block and @ref are non-NULL, then the shuffling is done after @ref,
2262  * otherwise the instructions are emitted normally at the end. The function
2263  * returns the last instruction inserted.
2264  *
2265  * Notice that @src and @dst cannot be the same register.
2266  */
2267 vec4_instruction *
2268 vec4_visitor::shuffle_64bit_data(dst_reg dst, src_reg src, bool for_write,
2269                                  bool for_scratch,
2270                                  elk_bblock_t *block, vec4_instruction *ref)
2271 {
2272    assert(type_sz(src.type) == 8);
2273    assert(type_sz(dst.type) == 8);
2274    assert(!regions_overlap(dst, 2 * REG_SIZE, src, 2 * REG_SIZE));
2275    assert(!ref == !block);
2276 
2277    elk_opcode mov_op = for_scratch ? ELK_VEC4_OPCODE_MOV_FOR_SCRATCH : ELK_OPCODE_MOV;
2278 
2279    const vec4_builder bld = !ref ? vec4_builder(this).at_end() :
2280                                    vec4_builder(this).at(block, ref->next);
2281 
2282    /* Resolve swizzle in src */
2283    if (src.swizzle != ELK_SWIZZLE_XYZW) {
2284       dst_reg data = dst_reg(this, glsl_dvec4_type());
2285       bld.emit(mov_op, data, src);
2286       src = src_reg(data);
2287    }
2288 
2289    /* dst+0.XY = src+0.XY */
2290    bld.group(4, 0).emit(mov_op, writemask(dst, WRITEMASK_XY), src);
2291 
2292    /* dst+0.ZW = src+1.XY */
2293    bld.group(4, for_write ? 1 : 0)
2294             .emit(mov_op, writemask(dst, WRITEMASK_ZW),
2295                   swizzle(byte_offset(src, REG_SIZE), ELK_SWIZZLE_XYXY));
2296 
2297    /* dst+1.XY = src+0.ZW */
2298    bld.group(4, for_write ? 0 : 1)
2299             .emit(mov_op, writemask(byte_offset(dst, REG_SIZE), WRITEMASK_XY),
2300                   swizzle(src, ELK_SWIZZLE_ZWZW));
2301 
2302    /* dst+1.ZW = src+1.ZW */
2303    return bld.group(4, 1)
2304             .emit(mov_op, writemask(byte_offset(dst, REG_SIZE), WRITEMASK_ZW),
2305                   byte_offset(src, REG_SIZE));
2306 }
2307 
2308 }
2309