/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "si_pipe.h"
#include "sid.h"
#include "si_build_pm4.h"

/* Set this if you want the ME to wait until CP DMA is done.
 * It should be set on the last CP DMA packet. */
#define CP_DMA_SYNC (1 << 0)

/* Set this if the source data was used as a destination in a previous CP DMA
 * packet. It's for preventing a read-after-write (RAW) hazard between two
 * CP DMA packets. */
#define CP_DMA_RAW_WAIT    (1 << 1)
#define CP_DMA_DST_IS_GDS  (1 << 2) /* the destination is GDS, not memory */
#define CP_DMA_CLEAR       (1 << 3) /* fill with a 32-bit value (src_va) instead of copying */
#define CP_DMA_PFP_SYNC_ME (1 << 4) /* make PFP wait until the ME-side DMA is done */
#define CP_DMA_SRC_IS_GDS  (1 << 5) /* the source is GDS, not memory */

/* The max number of bytes that can be copied per packet. */
static inline unsigned cp_dma_max_byte_count(struct si_context *sctx)
{
   unsigned max =
      sctx->gfx_level >= GFX11 ? 32767 :
      sctx->gfx_level >= GFX9 ? S_415_BYTE_COUNT_GFX9(~0u) : S_415_BYTE_COUNT_GFX6(~0u);

   /* make it aligned for optimal performance */
   return max & ~(SI_CPDMA_ALIGNMENT - 1);
}
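
/* Note: because the maximum is rounded down to a multiple of SI_CPDMA_ALIGNMENT,
 * splitting a large transfer into max-sized chunks (as the loops below do) keeps the
 * intermediate source/destination addresses at the same alignment as the start. */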

/* Return true if CP DMA must skip holes (uncommitted ranges) in a sparse buffer.
 * Only GFX9 needs this workaround. */
static inline bool cp_dma_sparse_wa(struct si_context *sctx, struct si_resource *sdst)
{
   if ((sctx->gfx_level == GFX9) && sdst && (sdst->flags & RADEON_FLAG_SPARSE))
      return true;

   return false;
}
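
/* Callers pair this check with radeon_winsys::buffer_find_next_committed_memory()
 * to advance past any unmapped range before emitting a packet; see the loops in
 * si_cp_dma_clear_buffer and si_cp_dma_copy_buffer below. */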

/* Emit a CP DMA packet to do a copy from one buffer to another, or to clear
 * a buffer. The size must fit in bits [20:0]. If CP_DMA_CLEAR is set, src_va is a 32-bit
 * clear value.
 */
static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, uint64_t dst_va,
                           uint64_t src_va, unsigned size, unsigned flags,
                           enum si_cache_policy cache_policy)
{
   uint32_t header = 0, command = 0;

   assert(size <= cp_dma_max_byte_count(sctx));
   assert(sctx->gfx_level != GFX6 || cache_policy == L2_BYPASS);

   if (sctx->gfx_level >= GFX9)
      command |= S_415_BYTE_COUNT_GFX9(size);
   else
      command |= S_415_BYTE_COUNT_GFX6(size);

   /* Sync flags. */
   if (flags & CP_DMA_SYNC)
      header |= S_411_CP_SYNC(1);

   if (flags & CP_DMA_RAW_WAIT)
      command |= S_415_RAW_WAIT(1);

   /* Src and dst flags. */
   if (sctx->gfx_level >= GFX9 && !(flags & CP_DMA_CLEAR) && src_va == dst_va) {
      header |= S_411_DST_SEL(V_411_NOWHERE); /* prefetch only */
   } else if (flags & CP_DMA_DST_IS_GDS) {
      header |= S_411_DST_SEL(V_411_GDS);
      /* GDS increments the address, not CP. */
      command |= S_415_DAS(V_415_REGISTER) | S_415_DAIC(V_415_NO_INCREMENT);
   } else if (sctx->gfx_level >= GFX7 && cache_policy != L2_BYPASS) {
      header |=
         S_501_DST_SEL(V_501_DST_ADDR_TC_L2) | S_501_DST_CACHE_POLICY(cache_policy == L2_STREAM);
   }

   if (flags & CP_DMA_CLEAR) {
      header |= S_411_SRC_SEL(V_411_DATA);
   } else if (flags & CP_DMA_SRC_IS_GDS) {
      header |= S_411_SRC_SEL(V_411_GDS);
      /* Both of these are required for GDS. It does increment the address. */
      command |= S_415_SAS(V_415_REGISTER) | S_415_SAIC(V_415_NO_INCREMENT);
   } else if (sctx->gfx_level >= GFX7 && cache_policy != L2_BYPASS) {
      header |=
         S_501_SRC_SEL(V_501_SRC_ADDR_TC_L2) | S_501_SRC_CACHE_POLICY(cache_policy == L2_STREAM);
   }

   radeon_begin(cs);

   if (sctx->gfx_level >= GFX7) {
      radeon_emit(PKT3(PKT3_DMA_DATA, 5, 0));
      radeon_emit(header);
      radeon_emit(src_va);       /* SRC_ADDR_LO [31:0] */
      radeon_emit(src_va >> 32); /* SRC_ADDR_HI [31:0] */
      radeon_emit(dst_va);       /* DST_ADDR_LO [31:0] */
      radeon_emit(dst_va >> 32); /* DST_ADDR_HI [31:0] */
      radeon_emit(command);
   } else {
      header |= S_411_SRC_ADDR_HI(src_va >> 32);

      radeon_emit(PKT3(PKT3_CP_DMA, 4, 0));
      radeon_emit(src_va);                  /* SRC_ADDR_LO [31:0] */
      radeon_emit(header);                  /* SRC_ADDR_HI [15:0] + flags. */
      radeon_emit(dst_va);                  /* DST_ADDR_LO [31:0] */
      radeon_emit((dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */
      radeon_emit(command);
   }

   /* CP DMA is executed in ME, but index buffers are read by PFP.
    * This ensures that ME (CP DMA) is idle before PFP starts fetching
    * indices. If we wanted to execute CP DMA in PFP, this packet
    * should precede it.
    */
   if (sctx->has_graphics && flags & CP_DMA_PFP_SYNC_ME) {
      radeon_emit(PKT3(PKT3_PFP_SYNC_ME, 0, 0));
      radeon_emit(0);
   }
   radeon_end();
}
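
/* Note: on GFX7+ the transfer is encoded as a single DMA_DATA packet whose last dword
 * carries the byte count and the RAW_WAIT/GDS bits; GFX6 uses the older CP_DMA packet,
 * where the high bits of the source address share a dword with the header flags and
 * the destination address is limited to 48 bits. */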

void si_cp_dma_wait_for_idle(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   /* Issue a dummy DMA that copies zero bytes.
    *
    * The DMA engine will see that there's no work to do and skip this
    * DMA request; however, the CP will see the sync flag and still wait
    * for all DMAs to complete.
    */
   si_emit_cp_dma(sctx, cs, 0, 0, 0, CP_DMA_SYNC, L2_BYPASS);
}

static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst,
                              struct pipe_resource *src, unsigned byte_count,
                              uint64_t remaining_size, unsigned user_flags, enum si_coherency coher,
                              bool *is_first, unsigned *packet_flags)
{
   if (!(user_flags & SI_OP_CPDMA_SKIP_CHECK_CS_SPACE))
      si_need_gfx_cs_space(sctx, 0);

   /* This must be done after need_cs_space. */
   if (dst)
      radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(dst),
                                RADEON_USAGE_WRITE | RADEON_PRIO_CP_DMA);
   if (src)
      radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(src),
                                RADEON_USAGE_READ | RADEON_PRIO_CP_DMA);

   /* Flush the caches for the first copy only.
    * Also wait for the previous CP DMA operations.
    */
   if (*is_first && sctx->flags)
      si_emit_cache_flush_direct(sctx);

   if (user_flags & SI_OP_SYNC_CPDMA_BEFORE && *is_first && !(*packet_flags & CP_DMA_CLEAR))
      *packet_flags |= CP_DMA_RAW_WAIT;

   *is_first = false;

   /* Do the synchronization after the last dma, so that all data
    * is written to memory.
    */
   if (user_flags & SI_OP_SYNC_AFTER && byte_count == remaining_size) {
      *packet_flags |= CP_DMA_SYNC;

      if (coher == SI_COHERENCY_SHADER)
         *packet_flags |= CP_DMA_PFP_SYNC_ME;
   }
}
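
/* Per-packet protocol implemented above: the first packet of a transfer flushes any
 * pending cache flags and may take CP_DMA_RAW_WAIT (when SI_OP_SYNC_CPDMA_BEFORE is
 * requested); the last packet (byte_count == remaining_size) takes CP_DMA_SYNC, plus
 * CP_DMA_PFP_SYNC_ME when the destination must be shader-coherent. */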

void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
                            struct pipe_resource *dst, uint64_t offset, uint64_t size,
                            unsigned value, unsigned user_flags, enum si_coherency coher,
                            enum si_cache_policy cache_policy)
{
   struct si_resource *sdst = si_resource(dst);
   uint64_t va = (sdst ? sdst->gpu_address : 0) + offset;
   bool is_first = true;

   assert(size && size % 4 == 0);

   if (user_flags & SI_OP_SYNC_GE_BEFORE)
      sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;

   if (user_flags & SI_OP_SYNC_CS_BEFORE)
      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;

   if (user_flags & SI_OP_SYNC_PS_BEFORE)
      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;

   /* Mark the buffer range of destination as valid (initialized),
    * so that transfer_map knows it should wait for the GPU when mapping
    * that range. */
   if (sdst) {
      util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size);

      if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE))
         sctx->flags |= si_get_flush_flags(sctx, coher, cache_policy);
   }

   if (sctx->flags)
      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);

   while (size) {
      unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
      unsigned dma_flags = CP_DMA_CLEAR | (sdst ? 0 : CP_DMA_DST_IS_GDS);

      if (cp_dma_sparse_wa(sctx, sdst)) {
         unsigned skip_count =
            sctx->ws->buffer_find_next_committed_memory(sdst->buf,
                  va - sdst->gpu_address, &byte_count);
         va += skip_count;
         size -= skip_count;
      }

      if (!byte_count)
         continue;

      si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, user_flags, coher, &is_first,
                        &dma_flags);

      /* Emit the clear packet. */
      si_emit_cp_dma(sctx, cs, va, value, byte_count, dma_flags, cache_policy);

      size -= byte_count;
      va += byte_count;
   }

   if (sdst && cache_policy != L2_BYPASS)
      sdst->TC_L2_dirty = true;

   /* If it's not a framebuffer fast clear... */
   if (coher == SI_COHERENCY_SHADER)
      sctx->num_cp_dma_calls++;
}
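
/* Usage sketch (not from this file; "buf" and the sizes are hypothetical): clearing
 * the first 1024 bytes of a buffer to zero with full synchronization, bypassing L2,
 * would look roughly like:
 *
 *    si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, buf, 0, 1024, 0,
 *                           SI_OP_SYNC_BEFORE_AFTER, SI_COHERENCY_SHADER, L2_BYPASS);
 *
 * si_test_gds() below uses the same pattern with small 16-byte buffers. */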

/**
 * Realign the CP DMA engine. This must be done after a copy with an unaligned
 * size.
 *
 * \param size  Remaining size to the CP DMA alignment.
 */
static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size, unsigned user_flags,
                                     enum si_coherency coher, enum si_cache_policy cache_policy,
                                     bool *is_first)
{
   uint64_t va;
   unsigned dma_flags = 0;
   unsigned scratch_size = SI_CPDMA_ALIGNMENT * 2;

   assert(size < SI_CPDMA_ALIGNMENT);

   /* Use the scratch buffer as the dummy buffer. The 3D engine should be
    * idle at this point.
    */
   if (!sctx->scratch_buffer || sctx->scratch_buffer->b.b.width0 < scratch_size) {
      si_resource_reference(&sctx->scratch_buffer, NULL);
      sctx->scratch_buffer = si_aligned_buffer_create(&sctx->screen->b,
                                                      PIPE_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL |
                                                      SI_RESOURCE_FLAG_DISCARDABLE,
                                                      PIPE_USAGE_DEFAULT, scratch_size, 256);
      if (!sctx->scratch_buffer)
         return;

      si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state);
   }

   si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b, &sctx->scratch_buffer->b.b, size, size,
                     user_flags, coher, is_first, &dma_flags);

   va = sctx->scratch_buffer->gpu_address;
   si_emit_cp_dma(sctx, &sctx->gfx_cs, va, va + SI_CPDMA_ALIGNMENT, size, dma_flags, cache_policy);
}
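
/* The dummy copy above stays entirely within the driver-internal scratch buffer
 * (destination at offset 0, source at offset SI_CPDMA_ALIGNMENT); its only purpose
 * is to bring the CP DMA engine's internal counter back to an aligned state. */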

/**
 * Do memcpy between buffers using CP DMA.
 * If src or dst is NULL, it means read or write GDS, respectively.
 *
 * \param user_flags    bitmask of SI_OP_* flags
 */
void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
                           struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset,
                           unsigned size, unsigned user_flags, enum si_coherency coher,
                           enum si_cache_policy cache_policy)
{
   uint64_t main_dst_offset, main_src_offset;
   unsigned skipped_size = 0;
   unsigned realign_size = 0;
   unsigned gds_flags = (dst ? 0 : CP_DMA_DST_IS_GDS) | (src ? 0 : CP_DMA_SRC_IS_GDS);
   bool is_first = true;

   assert(size);

   if (dst) {
      /* Skip this for the L2 prefetch. */
      if (dst != src || dst_offset != src_offset) {
         /* Mark the buffer range of destination as valid (initialized),
          * so that transfer_map knows it should wait for the GPU when mapping
          * that range. */
         util_range_add(dst, &si_resource(dst)->valid_buffer_range, dst_offset, dst_offset + size);
      }

      dst_offset += si_resource(dst)->gpu_address;
   }
   if (src)
      src_offset += si_resource(src)->gpu_address;

   /* The workarounds aren't needed on Fiji and beyond. */
   if (sctx->family <= CHIP_CARRIZO || sctx->family == CHIP_STONEY) {
      /* If the size is not aligned, we must add a dummy copy at the end
       * just to align the internal counter. Otherwise, the DMA engine
       * would slow down by an order of magnitude for following copies.
       */
      if (size % SI_CPDMA_ALIGNMENT)
         realign_size = SI_CPDMA_ALIGNMENT - (size % SI_CPDMA_ALIGNMENT);

      /* If the copy begins unaligned, we must start copying from the next
       * aligned block and the skipped part should be copied after everything
       * else has been copied. Only the src alignment matters, not dst.
       *
       * GDS doesn't need the source address to be aligned.
       */
      if (src && src_offset % SI_CPDMA_ALIGNMENT) {
         skipped_size = SI_CPDMA_ALIGNMENT - (src_offset % SI_CPDMA_ALIGNMENT);
         /* The main part will be skipped if the size is too small. */
         skipped_size = MIN2(skipped_size, size);
         size -= skipped_size;
      }
   }

   /* TMZ handling */
   if (unlikely(radeon_uses_secure_bos(sctx->ws))) {
      bool secure = src && (si_resource(src)->flags & RADEON_FLAG_ENCRYPTED);
      assert(!secure || (!dst || (si_resource(dst)->flags & RADEON_FLAG_ENCRYPTED)));
      if (secure != sctx->ws->cs_is_secure(&sctx->gfx_cs)) {
         si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW |
                               RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION, NULL);
      }
   }

   if (user_flags & SI_OP_SYNC_GE_BEFORE)
      sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;

   if (user_flags & SI_OP_SYNC_CS_BEFORE)
      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;

   if (user_flags & SI_OP_SYNC_PS_BEFORE)
      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;

   if ((dst || src) && !(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE))
      sctx->flags |= si_get_flush_flags(sctx, coher, cache_policy);

   if (sctx->flags)
      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);

   /* This is the main part doing the copying. Src is always aligned. */
   main_dst_offset = dst_offset + skipped_size;
   main_src_offset = src_offset + skipped_size;

   while (size) {
      unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
      unsigned dma_flags = gds_flags;

      if (cp_dma_sparse_wa(sctx, si_resource(dst))) {
         unsigned skip_count =
            sctx->ws->buffer_find_next_committed_memory(si_resource(dst)->buf,
                  main_dst_offset - si_resource(dst)->gpu_address, &byte_count);
         main_dst_offset += skip_count;
         main_src_offset += skip_count;
         size -= skip_count;
      }

      if (cp_dma_sparse_wa(sctx, si_resource(src))) {
         unsigned skip_count =
            sctx->ws->buffer_find_next_committed_memory(si_resource(src)->buf,
                  main_src_offset - si_resource(src)->gpu_address, &byte_count);
         main_dst_offset += skip_count;
         main_src_offset += skip_count;
         size -= skip_count;
      }

      if (!byte_count)
         continue;

      si_cp_dma_prepare(sctx, dst, src, byte_count, size + skipped_size + realign_size, user_flags,
                        coher, &is_first, &dma_flags);

      si_emit_cp_dma(sctx, &sctx->gfx_cs, main_dst_offset, main_src_offset, byte_count, dma_flags,
                     cache_policy);

      size -= byte_count;
      main_src_offset += byte_count;
      main_dst_offset += byte_count;
   }

   /* Copy the part we skipped because src wasn't aligned. */
   if (skipped_size) {
      unsigned dma_flags = gds_flags;

      si_cp_dma_prepare(sctx, dst, src, skipped_size, skipped_size + realign_size, user_flags,
                        coher, &is_first, &dma_flags);

      si_emit_cp_dma(sctx, &sctx->gfx_cs, dst_offset, src_offset, skipped_size, dma_flags,
                     cache_policy);
   }

   /* Finally, realign the engine if the size wasn't aligned. */
   if (realign_size) {
      si_cp_dma_realign_engine(sctx, realign_size, user_flags, coher, cache_policy, &is_first);
   }

   if (dst && cache_policy != L2_BYPASS)
      si_resource(dst)->TC_L2_dirty = true;

   /* If it's not a prefetch or GDS copy... */
   if (dst && src && (dst != src || dst_offset != src_offset))
      sctx->num_cp_dma_calls++;
}
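
/* Usage sketch (hypothetical buffers and offsets): a plain 16-byte copy between two
 * buffers with full synchronization, bypassing L2, would look roughly like:
 *
 *    si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, 16, SI_OP_SYNC_BEFORE_AFTER,
 *                          SI_COHERENCY_SHADER, L2_BYPASS);
 *
 * Passing dst == NULL writes to GDS and src == NULL reads from GDS, as exercised by
 * si_test_gds() below. */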

/* Standalone smoke test for GDS reads/writes via CP DMA: clears a source buffer,
 * round-trips the data through GDS, prints pass/fail, and exits the process. */
void si_test_gds(struct si_context *sctx)
{
   struct pipe_context *ctx = &sctx->b;
   struct pipe_resource *src, *dst;
   unsigned r[4] = {};
   unsigned offset = debug_get_num_option("OFFSET", 16);

   src = pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_DEFAULT, 16);
   dst = pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_DEFAULT, 16);
   si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, src, 0, 4, 0xabcdef01, SI_OP_SYNC_BEFORE_AFTER,
                          SI_COHERENCY_SHADER, L2_BYPASS);
   si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, src, 4, 4, 0x23456789, SI_OP_SYNC_BEFORE_AFTER,
                          SI_COHERENCY_SHADER, L2_BYPASS);
   si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, src, 8, 4, 0x87654321, SI_OP_SYNC_BEFORE_AFTER,
                          SI_COHERENCY_SHADER, L2_BYPASS);
   si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, src, 12, 4, 0xfedcba98, SI_OP_SYNC_BEFORE_AFTER,
                          SI_COHERENCY_SHADER, L2_BYPASS);
   si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, 0, 16, 0xdeadbeef, SI_OP_SYNC_BEFORE_AFTER,
                          SI_COHERENCY_SHADER, L2_BYPASS);

   si_cp_dma_copy_buffer(sctx, NULL, src, offset, 0, 16, SI_OP_SYNC_BEFORE_AFTER,
                         SI_COHERENCY_NONE, L2_BYPASS);
   si_cp_dma_copy_buffer(sctx, dst, NULL, 0, offset, 16, SI_OP_SYNC_BEFORE_AFTER,
                         SI_COHERENCY_NONE, L2_BYPASS);

   pipe_buffer_read(ctx, dst, 0, sizeof(r), r);
   printf("GDS copy  = %08x %08x %08x %08x -> %s\n", r[0], r[1], r[2], r[3],
          r[0] == 0xabcdef01 && r[1] == 0x23456789 && r[2] == 0x87654321 && r[3] == 0xfedcba98
             ? "pass"
             : "fail");

   si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, NULL, offset, 16, 0xc1ea4146,
                          SI_OP_SYNC_BEFORE_AFTER, SI_COHERENCY_NONE, L2_BYPASS);
   si_cp_dma_copy_buffer(sctx, dst, NULL, 0, offset, 16, SI_OP_SYNC_BEFORE_AFTER,
                         SI_COHERENCY_NONE, L2_BYPASS);

   pipe_buffer_read(ctx, dst, 0, sizeof(r), r);
   printf("GDS clear = %08x %08x %08x %08x -> %s\n", r[0], r[1], r[2], r[3],
          r[0] == 0xc1ea4146 && r[1] == 0xc1ea4146 && r[2] == 0xc1ea4146 && r[3] == 0xc1ea4146
             ? "pass"
             : "fail");

   pipe_resource_reference(&src, NULL);
   pipe_resource_reference(&dst, NULL);
   exit(0);
}

void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, unsigned offset,
                      unsigned size, unsigned dst_sel, unsigned engine, const void *data)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   assert(offset % 4 == 0);
   assert(size % 4 == 0);

   if (sctx->gfx_level == GFX6 && dst_sel == V_370_MEM)
      dst_sel = V_370_MEM_GRBM;

   radeon_add_to_buffer_list(sctx, cs, buf, RADEON_USAGE_WRITE | RADEON_PRIO_CP_DMA);
   uint64_t va = buf->gpu_address + offset;

   radeon_begin(cs);
   radeon_emit(PKT3(PKT3_WRITE_DATA, 2 + size / 4, 0));
   radeon_emit(S_370_DST_SEL(dst_sel) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine));
   radeon_emit(va);
   radeon_emit(va >> 32);
   radeon_emit_array((const uint32_t *)data, size / 4);
   radeon_end();
}
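
/* Note on si_cp_write_data: the CPU data is embedded directly in the command stream
 * (one WRITE_DATA packet with 2 + size/4 payload dwords), so this is only suitable
 * for small, dword-aligned updates; WR_CONFIRM makes the CP wait until the write has
 * actually landed before continuing. */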

void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned dst_sel,
                     struct si_resource *dst, unsigned dst_offset, unsigned src_sel,
                     struct si_resource *src, unsigned src_offset)
{
   /* cs can point to the compute IB, which has the buffer list in gfx_cs. */
   if (dst) {
      radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, dst, RADEON_USAGE_WRITE | RADEON_PRIO_CP_DMA);
   }
   if (src) {
      radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, src, RADEON_USAGE_READ | RADEON_PRIO_CP_DMA);
   }

   uint64_t dst_va = (dst ? dst->gpu_address : 0ull) + dst_offset;
   uint64_t src_va = (src ? src->gpu_address : 0ull) + src_offset;

   radeon_begin(cs);
   radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
   radeon_emit(COPY_DATA_SRC_SEL(src_sel) | COPY_DATA_DST_SEL(dst_sel) | COPY_DATA_WR_CONFIRM);
   radeon_emit(src_va);
   radeon_emit(src_va >> 32);
   radeon_emit(dst_va);
   radeon_emit(dst_va >> 32);
   radeon_end();
}
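
/* Note on si_cp_copy_data: this uses the COPY_DATA packet, which moves a single data
 * element between the locations selected by src_sel/dst_sel (memory, registers, etc.)
 * and waits for write confirmation; bulk transfers go through si_cp_dma_copy_buffer. */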