/*
 * Copyright © 2017-2018 Rob Clark <robclark@freedesktop.org>
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#define GPU 600

#include "ir3_context.h"
#include "ir3_image.h"

/*
 * Handlers for instructions changed/added in a6xx:
 *
 * Starting with a6xx, isam and stib are used for SSBOs as well; stib and the
 * atomic instructions (used for both SSBO and image) use a new instruction
 * encoding compared to a4xx/a5xx.
 */

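/* Split an SSBO offset into a register part plus an immediate part on
 * compilers that support immediate SSBO offsets (the "7" passed below is
 * presumably the number of immediate-offset bits available in the encoding);
 * otherwise the whole offset lives in the register and the immediate part is
 * zero.
 */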
static void
lower_ssbo_offset(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                  nir_src *offset_src,
                  struct ir3_instruction **offset, unsigned *imm_offset)
{
   if (ctx->compiler->has_ssbo_imm_offsets) {
      ir3_lower_imm_offset(ctx, intr, offset_src, 7, offset, imm_offset);
   } else {
      assert(nir_intrinsic_base(intr) == 0);
      *offset = ir3_get_src(ctx, offset_src)[0];
      *imm_offset = 0;
   }
}

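/* Common helper emitting an ldib for both SSBO and UAV loads.  Note the
 * 8-bit case below: it is loaded as a single typed TYPE_U16 component,
 * matching the blob's R8 imageBuffer encoding.
 */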
static void
emit_load_uav(struct ir3_context *ctx, nir_intrinsic_instr *intr,
              struct ir3_instruction *offset,
              unsigned imm_offset_val,
              struct ir3_instruction **dst)
{
   struct ir3_builder *b = &ctx->build;
   struct ir3_instruction *ldib;

   struct ir3_instruction *imm_offset = create_immed(b, imm_offset_val);

   ldib = ir3_LDIB(b, ir3_ssbo_to_ibo(ctx, intr->src[0]), 0, offset, 0,
                   imm_offset, 0);
   ldib->dsts[0]->wrmask = MASK(intr->num_components);
   ldib->cat6.iim_val = intr->num_components;
   ldib->cat6.d = reg_elems(offset->dsts[0]);
   switch (intr->def.bit_size) {
   case 8:
      /* This encodes the 8-bit SSBO load and matches blob's encoding of
       * imageBuffer access using VK_FORMAT_R8 and the dedicated 8-bit
       * descriptor. No vectorization is possible.
       */
      assert(intr->num_components == 1);

      ldib->cat6.type = TYPE_U16;
      ldib->cat6.typed = true;
      break;
   case 16:
      ldib->cat6.type = TYPE_U16;
      break;
   default:
      ldib->cat6.type = TYPE_U32;
      break;
   }
   ldib->barrier_class = IR3_BARRIER_BUFFER_R;
   ldib->barrier_conflict = IR3_BARRIER_BUFFER_W;

   if (imm_offset_val) {
      assert(ctx->compiler->has_ssbo_imm_offsets);
      ldib->flags |= IR3_INSTR_IMM_OFFSET;
   }

   ir3_handle_bindless_cat6(ldib, intr->src[0]);
   ir3_handle_nonuniform(ldib, intr);

   ir3_split_dest(b, dst, ldib, 0, intr->num_components);
}

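/* SSBO load: split the offset into register + immediate parts and emit the
 * ldib via the shared helper above.
 */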
/* src[] = { buffer_index, offset }. No const_index */
static void
emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                         struct ir3_instruction **dst)
{
   struct ir3_instruction *offset;
   unsigned imm_offset_val;

   lower_ssbo_offset(ctx, intr, &intr->src[2], &offset, &imm_offset_val);
   emit_load_uav(ctx, intr, offset, imm_offset_val, dst);
}

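/* UAV load: the offset src is a two-component value collected into
 * consecutive registers, with no immediate offset.
 */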
static void
emit_intrinsic_load_uav(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                        struct ir3_instruction **dst)
{
   struct ir3_builder *b = &ctx->build;
   struct ir3_instruction *offset;

   offset = ir3_create_collect(b, ir3_get_src(ctx, &intr->src[1]), 2);

   emit_load_uav(ctx, intr, offset, 0, dst);
}

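/* Note: the write mask is required to be dense (see the assert below);
 * sparse writes are presumably split up before we get here.
 */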
/* src[] = { value, block_index, offset }. const_index[] = { write_mask } */
static void
emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_builder *b = &ctx->build;
   struct ir3_instruction *stib, *val, *offset;
   unsigned wrmask = nir_intrinsic_write_mask(intr);
   unsigned ncomp = ffs(~wrmask) - 1;
   unsigned imm_offset_val;

   assert(wrmask == BITFIELD_MASK(intr->num_components));

   /* src0 is offset, src1 is immediate offset, src2 is value:
    */
   val = ir3_create_collect(b, ir3_get_src(ctx, &intr->src[0]), ncomp);

   /* An 8-bit store is always done on a single-component value that
    * additionally has to be masked to clear the high bits, otherwise the
    * store misbehaves.
    */
   if (intr->src[0].ssa->bit_size == 8) {
      assert(ncomp == 1);

      struct ir3_instruction *mask = create_immed_typed(b, 0xff, TYPE_U8);
      val = ir3_AND_B(b, val, 0, mask, 0);
      val->dsts[0]->flags |= IR3_REG_HALF;
   }

   lower_ssbo_offset(ctx, intr, &intr->src[3], &offset, &imm_offset_val);
   struct ir3_instruction *imm_offset = create_immed(b, imm_offset_val);

   stib = ir3_STIB(b, ir3_ssbo_to_ibo(ctx, intr->src[1]), 0, offset, 0,
                   imm_offset, 0, val, 0);
   stib->cat6.iim_val = ncomp;
   stib->cat6.d = 1;
   switch (intr->src[0].ssa->bit_size) {
   case 8:
      /* As with ldib, this encodes the 8-bit SSBO store and matches blob's
       * encoding of imageBuffer access using VK_FORMAT_R8 and the extra 8-bit
       * descriptor. No vectorization is possible and we have to override the
       * relevant field anyway.
       */
      stib->cat6.type = TYPE_U16;
      stib->cat6.iim_val = 4;
      stib->cat6.typed = true;
      break;
   case 16:
      stib->cat6.type = TYPE_U16;
      break;
   default:
      stib->cat6.type = TYPE_U32;
      break;
   }
   stib->barrier_class = IR3_BARRIER_BUFFER_W;
   stib->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;

   if (imm_offset_val) {
      assert(ctx->compiler->has_ssbo_imm_offsets);
      stib->flags |= IR3_INSTR_IMM_OFFSET;
   }

   ir3_handle_bindless_cat6(stib, intr->src[1]);
   ir3_handle_nonuniform(stib, intr);

   array_insert(ctx->block, ctx->block->keeps, stib);
}

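/* Map a NIR atomic op onto the corresponding cat6 IBO atomic.  Signedness
 * of min/max is not encoded in the opcode here; callers express it via
 * cat6.type instead.
 */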
static struct ir3_instruction *
emit_atomic(struct ir3_builder *b, nir_atomic_op op,
            struct ir3_instruction *ibo, struct ir3_instruction *src0,
            struct ir3_instruction *src1)
{
   switch (op) {
   case nir_atomic_op_iadd:
      return ir3_ATOMIC_B_ADD(b, ibo, 0, src0, 0, src1, 0);
   case nir_atomic_op_imin:
      return ir3_ATOMIC_B_MIN(b, ibo, 0, src0, 0, src1, 0);
   case nir_atomic_op_umin:
      return ir3_ATOMIC_B_MIN(b, ibo, 0, src0, 0, src1, 0);
   case nir_atomic_op_imax:
      return ir3_ATOMIC_B_MAX(b, ibo, 0, src0, 0, src1, 0);
   case nir_atomic_op_umax:
      return ir3_ATOMIC_B_MAX(b, ibo, 0, src0, 0, src1, 0);
   case nir_atomic_op_iand:
      return ir3_ATOMIC_B_AND(b, ibo, 0, src0, 0, src1, 0);
   case nir_atomic_op_ior:
      return ir3_ATOMIC_B_OR(b, ibo, 0, src0, 0, src1, 0);
   case nir_atomic_op_ixor:
      return ir3_ATOMIC_B_XOR(b, ibo, 0, src0, 0, src1, 0);
   case nir_atomic_op_xchg:
      return ir3_ATOMIC_B_XCHG(b, ibo, 0, src0, 0, src1, 0);
   case nir_atomic_op_cmpxchg:
      return ir3_ATOMIC_B_CMPXCHG(b, ibo, 0, src0, 0, src1, 0);
   default:
      unreachable("unknown atomic op");
   }
}

/*
 * SSBO atomic intrinsics
 *
 * All of the SSBO atomic memory operations read a value from memory,
 * compute a new value using one of the operations below, write the new
 * value to memory, and return the original value read.
 *
 * All operations take 3 sources except CompSwap, which takes 4. These
 * sources represent:
 *
 * 0: The SSBO buffer index.
 * 1: The offset into the SSBO buffer of the variable that the atomic
 *    operation will operate on.
 * 2: The data parameter to the atomic function (i.e. the value to add
 *    in, etc).
 * 3: For CompSwap only: the second data parameter.
 */
static struct ir3_instruction *
emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_builder *b = &ctx->build;
   struct ir3_instruction *atomic, *ibo, *src0, *src1, *data, *dummy;
   nir_atomic_op op = nir_intrinsic_atomic_op(intr);
   type_t type = nir_atomic_op_type(op) == nir_type_int ? TYPE_S32 : TYPE_U32;
   if (intr->def.bit_size == 64) {
      type = TYPE_ATOMIC_U64;
   }

   ibo = ir3_ssbo_to_ibo(ctx, intr->src[0]);

   data = ir3_get_src(ctx, &intr->src[2])[0];

   /* So this gets a bit creative:
    *
    *    src0    - vecN offset/coords
    *    src1.x  - is actually destination register
    *    src1.y  - is 'data' except for cmpxchg where src1.y is 'compare'
    *    src1.z  - is 'data' for cmpxchg
    *
    * Combining src and dest kinda doesn't work out so well with how
    * scheduling and RA work. So we create a dummy src2 which is tied to the
    * destination in RA (i.e. must be allocated to the same vec2/vec3
    * register) and then immediately extract the first component.
    *
    * Note that nir already multiplies the offset by four
    */
   dummy = create_immed(b, 0);

   if (op == nir_atomic_op_cmpxchg) {
      src0 = ir3_get_src(ctx, &intr->src[4])[0];
      struct ir3_instruction *compare = ir3_get_src(ctx, &intr->src[3])[0];
      if (intr->def.bit_size == 64) {
         struct ir3_instruction *dummy2 = create_immed(b, 0);
         struct ir3_instruction *compare2 = ir3_get_src(ctx, &intr->src[3])[1];
         struct ir3_instruction *data2 = ir3_get_src(ctx, &intr->src[2])[1];
         src1 = ir3_collect(b, dummy, dummy2, compare, compare2, data, data2);
      } else {
         src1 = ir3_collect(b, dummy, compare, data);
      }
   } else {
      src0 = ir3_get_src(ctx, &intr->src[3])[0];
      if (intr->def.bit_size == 64) {
         struct ir3_instruction *dummy2 = create_immed(b, 0);
         struct ir3_instruction *data2 = ir3_get_src(ctx, &intr->src[2])[1];
         src1 = ir3_collect(b, dummy, dummy2, data, data2);
      } else {
         src1 = ir3_collect(b, dummy, data);
      }
   }

   atomic = emit_atomic(b, op, ibo, src0, src1);
   atomic->cat6.iim_val = 1;
   atomic->cat6.d = 1;
   atomic->cat6.type = type;
   atomic->barrier_class = IR3_BARRIER_BUFFER_W;
   atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
   ir3_handle_bindless_cat6(atomic, intr->src[0]);

   /* even if nothing consumes the result, we can't DCE the instruction: */
   array_insert(ctx->block, ctx->block->keeps, atomic);

   atomic->dsts[0]->wrmask = src1->dsts[0]->wrmask;
   ir3_reg_tie(atomic->dsts[0], atomic->srcs[2]);
   ir3_handle_nonuniform(atomic, intr);

   size_t num_results = intr->def.bit_size == 64 ? 2 : 1;
   struct ir3_instruction *defs[num_results];
   ir3_split_dest(b, defs, atomic, 0, num_results);
   return ir3_create_collect(b, defs, num_results);
}

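/* Image loads use the same ldib encoding as the SSBO loads above, but with
 * the 'typed' bit set and the type derived from the image format.
 */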
/* src[] = { deref, coord, sample_index }. const_index[] = {} */
static void
emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                          struct ir3_instruction **dst)
{
   struct ir3_builder *b = &ctx->build;
   struct ir3_instruction *ldib;
   struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]);
   unsigned ncoords = ir3_get_image_coords(intr, NULL);

   ldib = ir3_LDIB(b, ir3_image_to_ibo(ctx, intr->src[0]), 0,
                   ir3_create_collect(b, coords, ncoords), 0,
                   create_immed(b, 0), 0);
   ldib->dsts[0]->wrmask = MASK(intr->num_components);
   ldib->cat6.iim_val = intr->num_components;
   ldib->cat6.d = ncoords;
   ldib->cat6.type = ir3_get_type_for_image_intrinsic(intr);
   ldib->cat6.typed = true;
   ldib->barrier_class = IR3_BARRIER_IMAGE_R;
   ldib->barrier_conflict = IR3_BARRIER_IMAGE_W;
   ir3_handle_bindless_cat6(ldib, intr->src[0]);
   ir3_handle_nonuniform(ldib, intr);

   ir3_split_dest(b, dst, ldib, 0, intr->num_components);
}

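/* Image store: the number of components written comes from the image
 * format, not from the NIR source value.
 */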
/* src[] = { deref, coord, sample_index, value }. const_index[] = {} */
static void
emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_builder *b = &ctx->build;
   struct ir3_instruction *stib;
   struct ir3_instruction *const *value = ir3_get_src(ctx, &intr->src[3]);
   struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]);
   unsigned ncoords = ir3_get_image_coords(intr, NULL);
   enum pipe_format format = nir_intrinsic_format(intr);
   unsigned ncomp = ir3_get_num_components_for_image_format(format);

   /* src0 is offset, src1 is value:
    */
   stib =
      ir3_STIB(b, ir3_image_to_ibo(ctx, intr->src[0]), 0,
               ir3_create_collect(b, coords, ncoords), 0, create_immed(b, 0), 0,
               ir3_create_collect(b, value, ncomp), 0);
   stib->cat6.iim_val = ncomp;
   stib->cat6.d = ncoords;
   stib->cat6.type = ir3_get_type_for_image_intrinsic(intr);
   stib->cat6.typed = true;
   stib->barrier_class = IR3_BARRIER_IMAGE_W;
   stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
   ir3_handle_bindless_cat6(stib, intr->src[0]);
   ir3_handle_nonuniform(stib, intr);

   array_insert(ctx->block, ctx->block->keeps, stib);
}

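/* Image atomics use the same dummy-destination/register-tie trick as the
 * SSBO atomics above, with the image coords collected into src0.
 */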
/* src[] = { deref, coord, sample_index, value, compare }. const_index[] = {} */
static struct ir3_instruction *
emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_builder *b = &ctx->build;
   struct ir3_instruction *atomic, *ibo, *src0, *src1, *dummy;
   struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]);
   struct ir3_instruction *value = ir3_get_src(ctx, &intr->src[3])[0];
   unsigned ncoords = ir3_get_image_coords(intr, NULL);
   nir_atomic_op op = nir_intrinsic_atomic_op(intr);

   ibo = ir3_image_to_ibo(ctx, intr->src[0]);

   /* So this gets a bit creative:
    *
    *    src0    - vecN offset/coords
    *    src1.x  - is actually destination register
    *    src1.y  - is 'value' except for cmpxchg where src1.y is 'compare'
    *    src1.z  - is 'value' for cmpxchg
    *
    * Combining src and dest kinda doesn't work out so well with how
    * scheduling and RA work. So we create a dummy src2 which is tied to the
    * destination in RA (i.e. must be allocated to the same vec2/vec3
    * register) and then immediately extract the first component.
    */
   dummy = create_immed(b, 0);
   src0 = ir3_create_collect(b, coords, ncoords);

   if (op == nir_atomic_op_cmpxchg) {
      struct ir3_instruction *compare = ir3_get_src(ctx, &intr->src[4])[0];
      src1 = ir3_collect(b, dummy, compare, value);
   } else {
      src1 = ir3_collect(b, dummy, value);
   }

   atomic = emit_atomic(b, op, ibo, src0, src1);
   atomic->cat6.iim_val = 1;
   atomic->cat6.d = ncoords;
   atomic->cat6.type = ir3_get_type_for_image_intrinsic(intr);
   atomic->cat6.typed = true;
   atomic->barrier_class = IR3_BARRIER_IMAGE_W;
   atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
   ir3_handle_bindless_cat6(atomic, intr->src[0]);

   /* even if nothing consumes the result, we can't DCE the instruction: */
   array_insert(ctx->block, ctx->block->keeps, atomic);

   atomic->dsts[0]->wrmask = src1->dsts[0]->wrmask;
   ir3_reg_tie(atomic->dsts[0], atomic->srcs[2]);
   ir3_handle_nonuniform(atomic, intr);
   struct ir3_instruction *split;
   ir3_split_dest(b, &split, atomic, 0, 1);
   return split;
}

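/* Image size query via resinfo.  resinfo always writes three components
 * regardless of the intrinsic's destination size, so the writemask is fixed
 * at xyz and the split below extracts only what the intrinsic needs.
 */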
static void
emit_intrinsic_image_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                          struct ir3_instruction **dst)
{
   struct ir3_builder *b = &ctx->build;
   struct ir3_instruction *ibo = ir3_image_to_ibo(ctx, intr->src[0]);
   struct ir3_instruction *resinfo = ir3_RESINFO(b, ibo, 0);
   resinfo->cat6.iim_val = 1;
   resinfo->cat6.d = intr->num_components;
   resinfo->cat6.type = TYPE_U32;
   resinfo->cat6.typed = false;
   /* resinfo has no writemask and always writes out 3 components: */
   compile_assert(ctx, intr->num_components <= 3);
   resinfo->dsts[0]->wrmask = MASK(3);
   ir3_handle_bindless_cat6(resinfo, intr->src[0]);
   ir3_handle_nonuniform(resinfo, intr);

   ir3_split_dest(b, dst, resinfo, 0, intr->num_components);
}

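/* Global load: the 64-bit address is collected from two 32-bit halves, with
 * a separate dword offset src.  Small constant offsets are folded into the
 * ldg immediate (converted to bytes); otherwise ldg.a with a register offset
 * is used, where a7xx appears to take the offset in bytes (hence the shift)
 * while earlier gens take it in dwords.
 */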
static void
emit_intrinsic_load_global_ir3(struct ir3_context *ctx,
                               nir_intrinsic_instr *intr,
                               struct ir3_instruction **dst)
{
   struct ir3_builder *b = &ctx->build;
   unsigned dest_components = nir_intrinsic_dest_components(intr);
   struct ir3_instruction *addr, *offset;

   addr = ir3_collect(b, ir3_get_src(ctx, &intr->src[0])[0],
                      ir3_get_src(ctx, &intr->src[0])[1]);

   struct ir3_instruction *load;

   bool const_offset_in_bounds =
      nir_src_is_const(intr->src[1]) &&
      nir_src_as_int(intr->src[1]) < (1 << 8) &&
      nir_src_as_int(intr->src[1]) > -(1 << 8);

   if (const_offset_in_bounds) {
      load = ir3_LDG(b, addr, 0,
                     create_immed(b, nir_src_as_int(intr->src[1]) * 4),
                     0, create_immed(b, dest_components), 0);
   } else {
      unsigned shift = ctx->compiler->gen >= 7 ? 2 : 0;
      offset = ir3_get_src(ctx, &intr->src[1])[0];
      if (shift) {
         /* A7XX TODO: Move to NIR for it to be properly optimized? */
         offset = ir3_SHL_B(b, offset, 0, create_immed(b, shift), 0);
      }
      load =
         ir3_LDG_A(b, addr, 0, offset, 0, create_immed(b, 0), 0,
                   create_immed(b, 0), 0, create_immed(b, dest_components), 0);
   }

   load->cat6.type = type_uint_size(intr->def.bit_size);
   load->dsts[0]->wrmask = MASK(dest_components);

   load->barrier_class = IR3_BARRIER_BUFFER_R;
   load->barrier_conflict = IR3_BARRIER_BUFFER_W;

   ir3_split_dest(b, dst, load, 0, dest_components);
}

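/* Global store: mirrors the load path above, including the constant offset
 * folding and the a7xx offset shift.
 */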
static void
emit_intrinsic_store_global_ir3(struct ir3_context *ctx,
                                nir_intrinsic_instr *intr)
{
   struct ir3_builder *b = &ctx->build;
   struct ir3_instruction *value, *addr, *offset;
   unsigned ncomp = nir_intrinsic_src_components(intr, 0);

   addr = ir3_collect(b, ir3_get_src(ctx, &intr->src[1])[0],
                      ir3_get_src(ctx, &intr->src[1])[1]);

   value = ir3_create_collect(b, ir3_get_src(ctx, &intr->src[0]), ncomp);

   struct ir3_instruction *stg;

   bool const_offset_in_bounds = nir_src_is_const(intr->src[2]) &&
                                 nir_src_as_int(intr->src[2]) < (1 << 10) &&
                                 nir_src_as_int(intr->src[2]) > -(1 << 10);

   if (const_offset_in_bounds) {
      stg = ir3_STG(b, addr, 0,
                    create_immed(b, nir_src_as_int(intr->src[2]) * 4), 0,
                    value, 0,
                    create_immed(b, ncomp), 0);
   } else {
      offset = ir3_get_src(ctx, &intr->src[2])[0];
      if (ctx->compiler->gen >= 7) {
         /* A7XX TODO: Move to NIR for it to be properly optimized? */
         offset = ir3_SHL_B(b, offset, 0, create_immed(b, 2), 0);
      }
      stg =
         ir3_STG_A(b, addr, 0, offset, 0, create_immed(b, 0), 0,
                   create_immed(b, 0), 0, value, 0, create_immed(b, ncomp), 0);
   }

   stg->cat6.type = type_uint_size(intr->src[0].ssa->bit_size);
   stg->cat6.iim_val = 1;

   array_insert(ctx->block, ctx->block->keeps, stg);

   stg->barrier_class = IR3_BARRIER_BUFFER_W;
   stg->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
}

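/* Global atomics take the 64-bit address directly and, unlike the IBO
 * atomics above, need no dummy destination tie.  64-bit operations collect
 * the low/high halves of each operand into src1.
 */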
static struct ir3_instruction *
emit_intrinsic_atomic_global(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_builder *b = &ctx->build;
   struct ir3_instruction *addr, *atomic, *src1;
   struct ir3_instruction *value = ir3_get_src(ctx, &intr->src[1])[0];
   nir_atomic_op op = nir_intrinsic_atomic_op(intr);
   type_t type = nir_atomic_op_type(op) == nir_type_int ? TYPE_S32 : TYPE_U32;
   if (intr->def.bit_size == 64) {
      type = TYPE_ATOMIC_U64;
   }

   addr = ir3_collect(b, ir3_get_src(ctx, &intr->src[0])[0],
                      ir3_get_src(ctx, &intr->src[0])[1]);

   if (op == nir_atomic_op_cmpxchg) {
      struct ir3_instruction *compare = ir3_get_src(ctx, &intr->src[2])[0];
      if (intr->def.bit_size == 64) {
         struct ir3_instruction *compare2 = ir3_get_src(ctx, &intr->src[2])[1];
         struct ir3_instruction *value2 = ir3_get_src(ctx, &intr->src[1])[1];
         src1 = ir3_collect(b, compare, compare2, value, value2);
      } else {
         src1 = ir3_collect(b, compare, value);
      }
   } else {
      if (intr->def.bit_size == 64) {
         struct ir3_instruction *value2 = ir3_get_src(ctx, &intr->src[1])[1];
         src1 = ir3_collect(b, value, value2);
      } else {
         src1 = value;
      }
   }

   switch (op) {
   case nir_atomic_op_iadd:
      atomic = ir3_ATOMIC_G_ADD(b, addr, 0, src1, 0);
      break;
   case nir_atomic_op_imin:
      atomic = ir3_ATOMIC_G_MIN(b, addr, 0, src1, 0);
      type = TYPE_S32;
      break;
   case nir_atomic_op_umin:
      atomic = ir3_ATOMIC_G_MIN(b, addr, 0, src1, 0);
      break;
   case nir_atomic_op_imax:
      atomic = ir3_ATOMIC_G_MAX(b, addr, 0, src1, 0);
      type = TYPE_S32;
      break;
   case nir_atomic_op_umax:
      atomic = ir3_ATOMIC_G_MAX(b, addr, 0, src1, 0);
      break;
   case nir_atomic_op_iand:
      atomic = ir3_ATOMIC_G_AND(b, addr, 0, src1, 0);
      break;
   case nir_atomic_op_ior:
      atomic = ir3_ATOMIC_G_OR(b, addr, 0, src1, 0);
      break;
   case nir_atomic_op_ixor:
      atomic = ir3_ATOMIC_G_XOR(b, addr, 0, src1, 0);
      break;
   case nir_atomic_op_xchg:
      atomic = ir3_ATOMIC_G_XCHG(b, addr, 0, src1, 0);
      break;
   case nir_atomic_op_cmpxchg:
      atomic = ir3_ATOMIC_G_CMPXCHG(b, addr, 0, src1, 0);
      break;
   default:
      unreachable("Unknown global atomic op");
   }

   atomic->cat6.iim_val = 1;
   atomic->cat6.d = 1;
   atomic->cat6.type = type;
   atomic->barrier_class = IR3_BARRIER_BUFFER_W;
   atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
   atomic->dsts[0]->wrmask = MASK(intr->def.bit_size == 64 ? 2 : 1);

   /* even if nothing consumes the result, we can't DCE the instruction: */
   array_insert(ctx->block, ctx->block->keeps, atomic);

   return atomic;
}

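/* Vtable of a6xx-specific intrinsic handlers, presumably selected by the
 * common ir3 context code based on GPU generation.
 */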
const struct ir3_context_funcs ir3_a6xx_funcs = {
   .emit_intrinsic_load_ssbo = emit_intrinsic_load_ssbo,
   .emit_intrinsic_load_uav = emit_intrinsic_load_uav,
   .emit_intrinsic_store_ssbo = emit_intrinsic_store_ssbo,
   .emit_intrinsic_atomic_ssbo = emit_intrinsic_atomic_ssbo,
   .emit_intrinsic_load_image = emit_intrinsic_load_image,
   .emit_intrinsic_store_image = emit_intrinsic_store_image,
   .emit_intrinsic_atomic_image = emit_intrinsic_atomic_image,
   .emit_intrinsic_image_size = emit_intrinsic_image_size,
   .emit_intrinsic_load_global_ir3 = emit_intrinsic_load_global_ir3,
   .emit_intrinsic_store_global_ir3 = emit_intrinsic_store_global_ir3,
   .emit_intrinsic_atomic_global = emit_intrinsic_atomic_global,
};