/*
 * Copyright © 2016-2018 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3d_compiler.h"

/* We don't do any address packing. */
#define __gen_user_data void
#define __gen_address_type uint32_t
#define __gen_address_offset(reloc) (*reloc)
#define __gen_emit_reloc(cl, reloc)
#include "cle/v3d_packet_v42_pack.h"
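/* The stubs above satisfy the address-handling hooks used by the generated
 * pack header: addresses are treated as plain 32-bit offsets and no
 * relocations are recorded, since here we only build TMU config words in
 * memory.
 */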

static inline struct qinst *
vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val)
{
        /* XXX perf: We should figure out how to merge ALU operations
         * producing the val with this MOV, when possible.
         */
        return vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
}

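/* Emits a TMU register write, or, if we are only counting writes
 * (tmu_writes != NULL), just bumps the counter without emitting anything.
 */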
static inline struct qinst *
vir_TMU_WRITE_or_count(struct v3d_compile *c,
                       enum v3d_qpu_waddr waddr,
                       struct qreg val,
                       uint32_t *tmu_writes)
{
        if (tmu_writes) {
                (*tmu_writes)++;
                return NULL;
        } else {
                return vir_TMU_WRITE(c, waddr, val);
        }
}

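/* Emits a WRTMUC: a NOP carrying the wrtmuc signal, with the attached uniform
 * providing the TMU configuration word to write.
 */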
static void
vir_WRTMUC(struct v3d_compile *c, enum quniform_contents contents, uint32_t data)
{
        struct qinst *inst = vir_NOP(c);
        inst->qpu.sig.wrtmuc = true;
        inst->uniform = vir_get_uniform_index(c, contents, data);
}

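/* Default P1/P2 parameter values. The per-operation configuration is compared
 * (memcmp) against these to decide whether the corresponding config write can
 * be skipped, since an unwritten parameter is assumed to keep its default
 * value.
 */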
static const struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked_default = {
        .per_pixel_mask_enable = true,
};

static const struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked_default = {
        .op = V3D_TMU_OP_REGULAR,
};

/**
 * If 'tmu_writes' is not NULL, this only counts the required register writes;
 * otherwise it emits the actual register writes.
 *
 * Note that emitting the register writes for the current TMU operation may
 * trigger a TMU flush, since any of the inputs required for the register
 * writes may be the result of a pending TMU operation. We must make sure such
 * a flush does not happen in the middle of the register writes for the
 * current TMU operation, which is why we always call ntq_get_src() even if we
 * are only interested in register write counts.
 */
static void
handle_tex_src(struct v3d_compile *c,
               nir_tex_instr *instr,
               unsigned src_idx,
               unsigned non_array_components,
               struct V3D42_TMU_CONFIG_PARAMETER_2 *p2_unpacked,
               struct qreg *s_out,
               unsigned *tmu_writes)
{
        /* Either we are calling this just to count required TMU writes, or we
         * are calling this to emit the actual TMU writes.
         */
        assert(tmu_writes || (s_out && p2_unpacked));

        struct qreg s;
        switch (instr->src[src_idx].src_type) {
        case nir_tex_src_coord:
                /* S triggers the lookup, so save it for the end. */
                s = ntq_get_src(c, instr->src[src_idx].src, 0);
                if (tmu_writes)
                        (*tmu_writes)++;
                else
                        *s_out = s;

                if (non_array_components > 1) {
                        struct qreg src =
                                ntq_get_src(c, instr->src[src_idx].src, 1);
                        vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUT, src,
                                               tmu_writes);
                }

                if (non_array_components > 2) {
                        struct qreg src =
                                ntq_get_src(c, instr->src[src_idx].src, 2);
                        vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUR, src,
                                               tmu_writes);
                }

                if (instr->is_array) {
                        struct qreg src =
                                ntq_get_src(c, instr->src[src_idx].src,
                                            instr->coord_components - 1);
                        vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUI, src,
                                               tmu_writes);
                }
                break;

        case nir_tex_src_bias: {
                struct qreg src = ntq_get_src(c, instr->src[src_idx].src, 0);
                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUB, src, tmu_writes);
                break;
        }

        case nir_tex_src_lod: {
                struct qreg src = ntq_get_src(c, instr->src[src_idx].src, 0);
                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUB, src, tmu_writes);
                if (!tmu_writes) {
                        /* With texel fetch automatic LOD is already disabled,
                         * and disable_autolod must not be enabled. For
                         * non-cubes we can use the TMUSLOD register, which
                         * implicitly sets disable_autolod.
                         */
                        assert(p2_unpacked);
                        if (instr->op != nir_texop_txf &&
                            instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
                                p2_unpacked->disable_autolod = true;
                        }
                }
                break;
        }

        case nir_tex_src_comparator: {
                struct qreg src = ntq_get_src(c, instr->src[src_idx].src, 0);
                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUDREF, src, tmu_writes);
                break;
        }

        case nir_tex_src_offset: {
                bool is_const_offset = nir_src_is_const(instr->src[src_idx].src);
                if (is_const_offset) {
                        if (!tmu_writes) {
                                p2_unpacked->offset_s =
                                        nir_src_comp_as_int(instr->src[src_idx].src, 0);
                                if (non_array_components >= 2)
                                        p2_unpacked->offset_t =
                                                nir_src_comp_as_int(instr->src[src_idx].src, 1);
                                if (non_array_components >= 3)
                                        p2_unpacked->offset_r =
                                                nir_src_comp_as_int(instr->src[src_idx].src, 2);
                        }
                } else {
                        struct qreg src_0 =
                                ntq_get_src(c, instr->src[src_idx].src, 0);
                        struct qreg src_1 =
                                ntq_get_src(c, instr->src[src_idx].src, 1);
                        if (!tmu_writes) {
                                struct qreg mask = vir_uniform_ui(c, 0xf);
                                struct qreg x, y, offset;

                                x = vir_AND(c, src_0, mask);
                                y = vir_AND(c, src_1, mask);
                                offset = vir_OR(c, x,
                                                vir_SHL(c, y, vir_uniform_ui(c, 4)));

                                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUOFF, offset);
                        } else {
                                (*tmu_writes)++;
                        }
                }
                break;
        }

        default:
                unreachable("unknown texture source");
        }
}

static void
vir_tex_handle_srcs(struct v3d_compile *c,
                    nir_tex_instr *instr,
                    struct V3D42_TMU_CONFIG_PARAMETER_2 *p2_unpacked,
                    struct qreg *s,
                    unsigned *tmu_writes)
{
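        /* Array textures carry the array index as their last coordinate
         * component, e.g. a 2D array has coord_components == 3 with only two
         * non-array (S/T) components.
         */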
        unsigned non_array_components = instr->op != nir_texop_lod ?
                instr->coord_components - instr->is_array :
                instr->coord_components;

        for (unsigned i = 0; i < instr->num_srcs; i++) {
                handle_tex_src(c, instr, i, non_array_components,
                               p2_unpacked, s, tmu_writes);
        }
}

static unsigned
get_required_tex_tmu_writes(struct v3d_compile *c, nir_tex_instr *instr)
{
        unsigned tmu_writes = 0;
        vir_tex_handle_srcs(c, instr, NULL, NULL, &tmu_writes);
        return tmu_writes;
}

void
v3d_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
{
        unsigned texture_idx = instr->texture_index;

        /* For instructions that don't have a sampler (e.g. txf) we bind
         * default sampler state via the backend_flags to handle precision.
         */
        unsigned sampler_idx = nir_tex_instr_need_sampler(instr) ?
                               instr->sampler_index : instr->backend_flags;

        /* Even if the texture operation doesn't need a sampler by
         * itself, we still need to add the sampler configuration
         * parameter if the output is 32 bit.
         */
        assert(sampler_idx < c->key->num_samplers_used);
        bool output_type_32_bit =
                c->key->sampler[sampler_idx].return_size == 32;

        struct V3D42_TMU_CONFIG_PARAMETER_0 p0_unpacked = {
        };

        /* Limit the number of channels returned to both how many the NIR
         * instruction writes and how many the instruction could produce.
         */
        nir_intrinsic_instr *store = nir_store_reg_for_def(&instr->def);
        if (store == NULL) {
                p0_unpacked.return_words_of_texture_data =
                        nir_def_components_read(&instr->def);
        } else {
                nir_def *reg = store->src[1].ssa;
                nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
                unsigned reg_num_components =
                        nir_intrinsic_num_components(decl);

                /* For the non-SSA case we don't have a full equivalent to
                 * nir_def_components_read. This is a problem for the 16-bit
                 * case: nir_lower_tex will not change the destination, as
                 * nir_tex_instr_dest_size will still return 4. The driver is
                 * just expected to not store on other channels, so we
                 * manually ensure that here.
                 */
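                /* Two 16-bit channels pack into each 32-bit return word, so a
                 * 16-bit result never needs more than 2 return words.
                 */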
                uint32_t num_components = output_type_32_bit ?
                        MIN2(reg_num_components, 4) :
                        MIN2(reg_num_components, 2);

                p0_unpacked.return_words_of_texture_data = (1 << num_components) - 1;
        }
        assert(p0_unpacked.return_words_of_texture_data != 0);

        struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked = {
                .op = V3D_TMU_OP_REGULAR,
                .gather_mode = instr->op == nir_texop_tg4,
                .gather_component = instr->component,
                .coefficient_mode = instr->op == nir_texop_txd,
                .disable_autolod = instr->op == nir_texop_tg4,
                .lod_query = instr->op == nir_texop_lod,
        };

        const unsigned tmu_writes = get_required_tex_tmu_writes(c, instr);

        /* The input FIFO has 16 slots across all threads, so if we require
         * more than that we need to lower the thread count.
         */
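        /* E.g. at 4 threads each thread may use 16/4 = 4 input FIFO slots;
         * a TMU operation needing 5 writes drops us to 2 threads.
         */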
        while (tmu_writes > 16 / c->threads)
                c->threads /= 2;

        /* If pipelining this TMU operation would overflow TMU fifos, we need
         * to flush any outstanding TMU operations.
         */
        const unsigned dest_components =
           util_bitcount(p0_unpacked.return_words_of_texture_data);
        if (ntq_tmu_fifo_overflow(c, dest_components))
                ntq_flush_tmu(c);

        /* Process the texture sources, emitting the corresponding TMU
         * register writes.
         */
        struct qreg s = { };
        vir_tex_handle_srcs(c, instr, &p2_unpacked, &s, NULL);

        uint32_t p0_packed;
        V3D42_TMU_CONFIG_PARAMETER_0_pack(NULL,
                                          (uint8_t *)&p0_packed,
                                          &p0_unpacked);

        uint32_t p2_packed;
        V3D42_TMU_CONFIG_PARAMETER_2_pack(NULL,
                                          (uint8_t *)&p2_packed,
                                          &p2_unpacked);

        /* Load texture_idx number into the high bits of the texture address
         * field, which will be used by the driver to decide which texture to
         * put in the actual address field.
         */
        p0_packed |= texture_idx << 24;

        vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P0, p0_packed);

        /* p1 is optional, but we can skip it only if p2 can be skipped too */
        bool needs_p2_config =
                (instr->op == nir_texop_lod ||
                 memcmp(&p2_unpacked, &p2_unpacked_default,
                        sizeof(p2_unpacked)) != 0);

        /* To handle the cases where we can't just use p1_unpacked_default */
        bool non_default_p1_config = nir_tex_instr_need_sampler(instr) ||
                output_type_32_bit;

        if (non_default_p1_config) {
                struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked = {
                        .output_type_32_bit = output_type_32_bit,

                        .unnormalized_coordinates = (instr->sampler_dim ==
                                                     GLSL_SAMPLER_DIM_RECT),
                };

                /* Word enables can't ask for more channels than the
                 * output type could provide (2 for f16, 4 for
                 * 32-bit).
                 */
                assert(!p1_unpacked.output_type_32_bit ||
                       p0_unpacked.return_words_of_texture_data < (1 << 4));
                assert(p1_unpacked.output_type_32_bit ||
                       p0_unpacked.return_words_of_texture_data < (1 << 2));

                uint32_t p1_packed;
                V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL,
                                                  (uint8_t *)&p1_packed,
                                                  &p1_unpacked);

                if (nir_tex_instr_need_sampler(instr)) {
                        /* Load sampler_idx number into the high bits of the
                         * sampler address field, which will be used by the
                         * driver to decide which sampler to put in the actual
                         * address field.
                         */
                        p1_packed |= sampler_idx << 24;

                        vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P1, p1_packed);
                } else {
                        /* In this case, we don't need to merge in any
                         * sampler state from the API and can just use
                         * our packed bits.
                         */
                        vir_WRTMUC(c, QUNIFORM_CONSTANT, p1_packed);
                }
        } else if (needs_p2_config) {
                /* Configuration parameters need to be set up in order, and if
                 * P2 is needed we have to set up P1 too, even if the texture
                 * operation doesn't need any sampler info. We can set up
                 * default info, though, and avoid asking the driver for the
                 * sampler state address.
                 */
                uint32_t p1_packed_default;
                V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL,
                                                  (uint8_t *)&p1_packed_default,
                                                  &p1_unpacked_default);
                vir_WRTMUC(c, QUNIFORM_CONSTANT, p1_packed_default);
        }

        if (needs_p2_config)
                vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);

        /* Emit the retiring TMU write that triggers the lookup. */
        struct qinst *retiring;
        if (instr->op == nir_texop_txf) {
                assert(instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE);
                retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s);
        } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
                retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s);
        } else if (instr->op == nir_texop_txl) {
                retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s);
        } else {
                retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s);
        }

        retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data;
        ntq_add_pending_tmu_flush(c, &instr->def,
                                  p0_unpacked.return_words_of_texture_data);
}

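/* Map a NIR atomic op to the V3D TMU op code. The TMU op encodings are shared
 * between a write-type and a read-type operation (hence names like
 * V3D_TMU_OP_WRITE_AND_READ_INC); only the write-type half applies to these
 * atomics, which is why some of the mappings look surprising at first glance.
 */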
static uint32_t
v3d_image_atomic_tmu_op(nir_intrinsic_instr *instr)
{
        nir_atomic_op atomic_op = nir_intrinsic_atomic_op(instr);
        switch (atomic_op) {
        case nir_atomic_op_iadd:    return v3d_get_op_for_atomic_add(instr, 3);
        case nir_atomic_op_imin:    return V3D_TMU_OP_WRITE_SMIN;
        case nir_atomic_op_umin:    return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
        case nir_atomic_op_imax:    return V3D_TMU_OP_WRITE_SMAX;
        case nir_atomic_op_umax:    return V3D_TMU_OP_WRITE_UMAX;
        case nir_atomic_op_iand:    return V3D_TMU_OP_WRITE_AND_READ_INC;
        case nir_atomic_op_ior:     return V3D_TMU_OP_WRITE_OR_READ_DEC;
        case nir_atomic_op_ixor:    return V3D_TMU_OP_WRITE_XOR_READ_NOT;
        case nir_atomic_op_xchg:    return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
        case nir_atomic_op_cmpxchg: return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
        default:                    unreachable("unknown atomic op");
        }
}

static uint32_t
v3d_image_load_store_tmu_op(nir_intrinsic_instr *instr)
{
        switch (instr->intrinsic) {
        case nir_intrinsic_image_load:
        case nir_intrinsic_image_store:
                return V3D_TMU_OP_REGULAR;

        case nir_intrinsic_image_atomic:
        case nir_intrinsic_image_atomic_swap:
                return v3d_image_atomic_tmu_op(instr);

        default:
                unreachable("unknown image intrinsic");
        }
}

/**
 * If 'tmu_writes' is not NULL, this only counts the required register writes;
 * otherwise it emits the actual register writes.
 *
 * Note that emitting the register writes for the current TMU operation may
 * trigger a TMU flush, since any of the inputs required for the register
 * writes may be the result of a pending TMU operation. We must make sure such
 * a flush does not happen in the middle of the register writes for the
 * current TMU operation, which is why we always call ntq_get_src() even if we
 * are only interested in register write counts.
 */
static struct qinst *
vir_image_emit_register_writes(struct v3d_compile *c,
                               nir_intrinsic_instr *instr,
                               bool atomic_add_replaced,
                               uint32_t *tmu_writes)
{
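        /* NIR image intrinsic sources used below: src[1] holds the
         * coordinates, src[3] the store/atomic data and src[4] the extra
         * cmpxchg operand (src[0] is the image index, consumed by the
         * caller).
         */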
        if (tmu_writes)
                *tmu_writes = 0;

        bool is_1d = false;
        switch (nir_intrinsic_image_dim(instr)) {
        case GLSL_SAMPLER_DIM_1D:
                is_1d = true;
                break;
        case GLSL_SAMPLER_DIM_BUF:
                break;
        case GLSL_SAMPLER_DIM_2D:
        case GLSL_SAMPLER_DIM_RECT:
        case GLSL_SAMPLER_DIM_CUBE: {
                struct qreg src = ntq_get_src(c, instr->src[1], 1);
                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUT, src, tmu_writes);
                break;
        }
        case GLSL_SAMPLER_DIM_3D: {
                struct qreg src_1_1 = ntq_get_src(c, instr->src[1], 1);
                struct qreg src_1_2 = ntq_get_src(c, instr->src[1], 2);
                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUT, src_1_1, tmu_writes);
                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUR, src_1_2, tmu_writes);
                break;
        }
        default:
                unreachable("bad image sampler dim");
        }

        /* In order to fetch on a cube map, we need to interpret it as a
         * 2D array, where the third coordinate is the face index.
         */
        if (nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_CUBE ||
            nir_intrinsic_image_array(instr)) {
                struct qreg src = ntq_get_src(c, instr->src[1], is_1d ? 1 : 2);
                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUI, src, tmu_writes);
        }

        /* Emit the data writes for atomics or image store. */
        if (instr->intrinsic != nir_intrinsic_image_load &&
            !atomic_add_replaced) {
                for (int i = 0; i < nir_intrinsic_src_components(instr, 3); i++) {
                        struct qreg src_3_i = ntq_get_src(c, instr->src[3], i);
                        vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUD, src_3_i,
                                               tmu_writes);
                }

                /* Second atomic argument */
                if (instr->intrinsic == nir_intrinsic_image_atomic_swap &&
                    nir_intrinsic_atomic_op(instr) == nir_atomic_op_cmpxchg) {
                        struct qreg src_4_0 = ntq_get_src(c, instr->src[4], 0);
                        vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUD, src_4_0,
                                               tmu_writes);
                }
        }

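        /* For stores and atomics inside non-uniform control flow, predicate
         * the retiring TMUSF write on the execute mask: PUSHZ on c->execute
         * sets the A flag only for active lanes, and the write below is then
         * conditioned with IFA so inactive lanes don't trigger the operation.
         */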
        struct qreg src_1_0 = ntq_get_src(c, instr->src[1], 0);
        if (!tmu_writes && vir_in_nonuniform_control_flow(c) &&
            instr->intrinsic != nir_intrinsic_image_load) {
                vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
                           V3D_QPU_PF_PUSHZ);
        }

        struct qinst *retiring =
                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes);

        if (!tmu_writes && vir_in_nonuniform_control_flow(c) &&
            instr->intrinsic != nir_intrinsic_image_load) {
                struct qinst *last_inst =
                        (struct qinst *)c->cur_block->instructions.prev;
                vir_set_cond(last_inst, V3D_QPU_COND_IFA);
        }

        return retiring;
}

static unsigned
get_required_image_tmu_writes(struct v3d_compile *c,
                              nir_intrinsic_instr *instr,
                              bool atomic_add_replaced)
{
        unsigned tmu_writes;
        vir_image_emit_register_writes(c, instr, atomic_add_replaced,
                                       &tmu_writes);
        return tmu_writes;
}

void
v3d_vir_emit_image_load_store(struct v3d_compile *c,
                              nir_intrinsic_instr *instr)
{
        unsigned format = nir_intrinsic_format(instr);
        unsigned unit = nir_src_as_uint(instr->src[0]);

        struct V3D42_TMU_CONFIG_PARAMETER_0 p0_unpacked = {
        };

        struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked = {
                .per_pixel_mask_enable = true,
                .output_type_32_bit = v3d_gl_format_is_return_32(format),
        };

        struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked = { 0 };

        /* Limit the number of channels returned to both how many the NIR
         * instruction writes and how many the instruction could produce.
         */
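        /* Two 16-bit channels pack into each 32-bit return word, e.g. a
         * four-channel 16-bit load only needs (4 + 1) / 2 = 2 return words,
         * hence the halving below.
         */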
        uint32_t instr_return_channels = nir_intrinsic_dest_components(instr);
        if (!p1_unpacked.output_type_32_bit)
                instr_return_channels = (instr_return_channels + 1) / 2;

        p0_unpacked.return_words_of_texture_data =
                (1 << instr_return_channels) - 1;

        p2_unpacked.op = v3d_image_load_store_tmu_op(instr);

        /* If we were able to replace an atomic_add with an inc/dec, things
         * work slightly differently: we don't load the amount to add/sub, as
         * that is implicit.
         */
        bool atomic_add_replaced =
                instr->intrinsic == nir_intrinsic_image_atomic &&
                nir_intrinsic_atomic_op(instr) == nir_atomic_op_iadd &&
                (p2_unpacked.op == V3D_TMU_OP_WRITE_AND_READ_INC ||
                 p2_unpacked.op == V3D_TMU_OP_WRITE_OR_READ_DEC);

        uint32_t p0_packed;
        V3D42_TMU_CONFIG_PARAMETER_0_pack(NULL,
                                          (uint8_t *)&p0_packed,
                                          &p0_unpacked);

        /* Load the unit number into the high bits of the texture or sampler
         * address field, which will be used by the driver to decide which
         * texture to put in the actual address field.
         */
        p0_packed |= unit << 24;

        uint32_t p1_packed;
        V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL,
                                          (uint8_t *)&p1_packed,
                                          &p1_unpacked);

        uint32_t p2_packed;
        V3D42_TMU_CONFIG_PARAMETER_2_pack(NULL,
                                          (uint8_t *)&p2_packed,
                                          &p2_unpacked);

        if (instr->intrinsic != nir_intrinsic_image_load)
                c->tmu_dirty_rcl = true;

        const uint32_t tmu_writes =
                get_required_image_tmu_writes(c, instr, atomic_add_replaced);

        /* The input FIFO has 16 slots across all threads, so if we require
         * more than that we need to lower the thread count.
         */
        while (tmu_writes > 16 / c->threads)
                c->threads /= 2;

        /* If pipelining this TMU operation would overflow TMU fifos, we need
         * to flush any outstanding TMU operations.
         */
        if (ntq_tmu_fifo_overflow(c, instr_return_channels))
                ntq_flush_tmu(c);

        vir_WRTMUC(c, QUNIFORM_IMAGE_TMU_CONFIG_P0, p0_packed);
        if (memcmp(&p1_unpacked, &p1_unpacked_default, sizeof(p1_unpacked)))
                vir_WRTMUC(c, QUNIFORM_CONSTANT, p1_packed);
        if (memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked)))
                vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);

        struct qinst *retiring =
                vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL);
        retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data;
        ntq_add_pending_tmu_flush(c, &instr->def,
                                  p0_unpacked.return_words_of_texture_data);
}