• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Marek Olšák <maraeo@gmail.com>
 */
26 
27 #include "si_pipe.h"
28 #include "sid.h"
29 #include "radeon/r600_cs.h"
30 
/* Alignment for optimal performance. */
#define CP_DMA_ALIGNMENT	32
/* The max number of bytes to copy per packet: the largest multiple of
 * CP_DMA_ALIGNMENT that is below (1 << 21). */
#define CP_DMA_MAX_BYTE_COUNT	((1 << 21) - CP_DMA_ALIGNMENT)

/* Set this if you want the ME to wait until CP DMA is done.
 * It should be set on the last CP DMA packet. */
#define CP_DMA_SYNC		(1 << 0)

/* Set this if the source data was used as a destination in a previous CP DMA
 * packet. It's for preventing a read-after-write (RAW) hazard between two
 * CP DMA packets. */
#define CP_DMA_RAW_WAIT		(1 << 1)
/* Select TC L2 as the source/destination of the transfer. */
#define CP_DMA_USE_L2		(1 << 2) /* CIK+ */
/* The packet is a clear: the source is an immediate 32-bit value instead of
 * an address. */
#define CP_DMA_CLEAR		(1 << 3)
46 
47 /* Emit a CP DMA packet to do a copy from one buffer to another, or to clear
48  * a buffer. The size must fit in bits [20:0]. If CP_DMA_CLEAR is set, src_va is a 32-bit
49  * clear value.
50  */
si_emit_cp_dma(struct si_context * sctx,uint64_t dst_va,uint64_t src_va,unsigned size,unsigned flags,enum r600_coherency coher)51 static void si_emit_cp_dma(struct si_context *sctx, uint64_t dst_va,
52 			   uint64_t src_va, unsigned size, unsigned flags,
53 			   enum r600_coherency coher)
54 {
55 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
56 	uint32_t header = 0, command = S_414_BYTE_COUNT(size);
57 
58 	assert(size);
59 	assert(size <= CP_DMA_MAX_BYTE_COUNT);
60 
61 	/* Sync flags. */
62 	if (flags & CP_DMA_SYNC)
63 		header |= S_411_CP_SYNC(1);
64 	else
65 		command |= S_414_DISABLE_WR_CONFIRM(1);
66 
67 	if (flags & CP_DMA_RAW_WAIT)
68 		command |= S_414_RAW_WAIT(1);
69 
70 	/* Src and dst flags. */
71 	if (flags & CP_DMA_USE_L2)
72 		header |= S_411_DSL_SEL(V_411_DST_ADDR_TC_L2);
73 
74 	if (flags & CP_DMA_CLEAR)
75 		header |= S_411_SRC_SEL(V_411_DATA);
76 	else if (flags & CP_DMA_USE_L2)
77 		header |= S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2);
78 
79 	if (sctx->b.chip_class >= CIK) {
80 		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
81 		radeon_emit(cs, header);
82 		radeon_emit(cs, src_va);	/* SRC_ADDR_LO [31:0] */
83 		radeon_emit(cs, src_va >> 32);	/* SRC_ADDR_HI [31:0] */
84 		radeon_emit(cs, dst_va);	/* DST_ADDR_LO [31:0] */
85 		radeon_emit(cs, dst_va >> 32);	/* DST_ADDR_HI [31:0] */
86 		radeon_emit(cs, command);
87 	} else {
88 		header |= S_411_SRC_ADDR_HI(src_va >> 32);
89 
90 		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
91 		radeon_emit(cs, src_va);	/* SRC_ADDR_LO [31:0] */
92 		radeon_emit(cs, header);	/* SRC_ADDR_HI [15:0] + flags. */
93 		radeon_emit(cs, dst_va);	/* DST_ADDR_LO [31:0] */
94 		radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */
95 		radeon_emit(cs, command);
96 	}
97 
98 	/* CP DMA is executed in ME, but index buffers are read by PFP.
99 	 * This ensures that ME (CP DMA) is idle before PFP starts fetching
100 	 * indices. If we wanted to execute CP DMA in PFP, this packet
101 	 * should precede it.
102 	 */
103 	if (coher == R600_COHERENCY_SHADER && flags & CP_DMA_SYNC) {
104 		radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
105 		radeon_emit(cs, 0);
106 	}
107 }
108 
get_flush_flags(struct si_context * sctx,enum r600_coherency coher)109 static unsigned get_flush_flags(struct si_context *sctx, enum r600_coherency coher)
110 {
111 	switch (coher) {
112 	default:
113 	case R600_COHERENCY_NONE:
114 		return 0;
115 	case R600_COHERENCY_SHADER:
116 		return SI_CONTEXT_INV_SMEM_L1 |
117 		       SI_CONTEXT_INV_VMEM_L1 |
118 		       (sctx->b.chip_class == SI ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
119 	case R600_COHERENCY_CB_META:
120 		return SI_CONTEXT_FLUSH_AND_INV_CB |
121 		       SI_CONTEXT_FLUSH_AND_INV_CB_META;
122 	}
123 }
124 
get_tc_l2_flag(struct si_context * sctx,enum r600_coherency coher)125 static unsigned get_tc_l2_flag(struct si_context *sctx, enum r600_coherency coher)
126 {
127 	return coher == R600_COHERENCY_SHADER &&
128 	       sctx->b.chip_class >= CIK ? CP_DMA_USE_L2 : 0;
129 }
130 
si_cp_dma_prepare(struct si_context * sctx,struct pipe_resource * dst,struct pipe_resource * src,unsigned byte_count,uint64_t remaining_size,unsigned user_flags,bool * is_first,unsigned * packet_flags)131 static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst,
132 			      struct pipe_resource *src, unsigned byte_count,
133 			      uint64_t remaining_size, unsigned user_flags,
134 			      bool *is_first, unsigned *packet_flags)
135 {
136 	/* Fast exit for a CPDMA prefetch. */
137 	if ((user_flags & SI_CPDMA_SKIP_ALL) == SI_CPDMA_SKIP_ALL) {
138 		*is_first = false;
139 		return;
140 	}
141 
142 	if (!(user_flags & SI_CPDMA_SKIP_BO_LIST_UPDATE)) {
143 		/* Count memory usage in so that need_cs_space can take it into account. */
144 		r600_context_add_resource_size(&sctx->b.b, dst);
145 		if (src)
146 			r600_context_add_resource_size(&sctx->b.b, src);
147 	}
148 
149 	if (!(user_flags & SI_CPDMA_SKIP_CHECK_CS_SPACE))
150 		si_need_cs_space(sctx);
151 
152 	/* This must be done after need_cs_space. */
153 	if (!(user_flags & SI_CPDMA_SKIP_BO_LIST_UPDATE)) {
154 		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
155 					  (struct r600_resource*)dst,
156 					  RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
157 		if (src)
158 			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
159 						  (struct r600_resource*)src,
160 						  RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
161 	}
162 
163 	/* Flush the caches for the first copy only.
164 	 * Also wait for the previous CP DMA operations.
165 	 */
166 	if (!(user_flags & SI_CPDMA_SKIP_GFX_SYNC) && sctx->b.flags)
167 		si_emit_cache_flush(sctx);
168 
169 	if (!(user_flags & SI_CPDMA_SKIP_SYNC_BEFORE) && *is_first)
170 		*packet_flags |= CP_DMA_RAW_WAIT;
171 
172 	*is_first = false;
173 
174 	/* Do the synchronization after the last dma, so that all data
175 	 * is written to memory.
176 	 */
177 	if (!(user_flags & SI_CPDMA_SKIP_SYNC_AFTER) &&
178 	    byte_count == remaining_size)
179 		*packet_flags |= CP_DMA_SYNC;
180 }
181 
si_clear_buffer(struct pipe_context * ctx,struct pipe_resource * dst,uint64_t offset,uint64_t size,unsigned value,enum r600_coherency coher)182 static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
183 			    uint64_t offset, uint64_t size, unsigned value,
184 			    enum r600_coherency coher)
185 {
186 	struct si_context *sctx = (struct si_context*)ctx;
187 	struct radeon_winsys *ws = sctx->b.ws;
188 	struct r600_resource *rdst = r600_resource(dst);
189 	unsigned tc_l2_flag = get_tc_l2_flag(sctx, coher);
190 	unsigned flush_flags = get_flush_flags(sctx, coher);
191 	bool is_first = true;
192 
193 	if (!size)
194 		return;
195 
196 	/* Mark the buffer range of destination as valid (initialized),
197 	 * so that transfer_map knows it should wait for the GPU when mapping
198 	 * that range. */
199 	util_range_add(&rdst->valid_buffer_range, offset,
200 		       offset + size);
201 
202 	/* Fallback for unaligned clears. */
203 	if (offset % 4 != 0 || size % 4 != 0) {
204 		uint8_t *map = r600_buffer_map_sync_with_rings(&sctx->b, rdst,
205 							       PIPE_TRANSFER_WRITE);
206 		map += offset;
207 		for (uint64_t i = 0; i < size; i++) {
208 			unsigned byte_within_dword = (offset + i) % 4;
209 			*map++ = (value >> (byte_within_dword * 8)) & 0xff;
210 		}
211 		return;
212 	}
213 
214 	/* dma_clear_buffer can use clear_buffer on failure. Make sure that
215 	 * doesn't happen. We don't want an infinite recursion: */
216 	if (sctx->b.dma.cs &&
217 	    /* CP DMA is very slow. Always use SDMA for big clears. This
218 	     * alone improves DeusEx:MD performance by 70%. */
219 	    (size > 128 * 1024 ||
220 	     /* Buffers not used by the GFX IB yet will be cleared by SDMA.
221 	      * This happens to move most buffer clears to SDMA, including
222 	      * DCC and CMASK clears, because pipe->clear clears them before
223 	      * si_emit_framebuffer_state (in a draw call) adds them.
224 	      * For example, DeusEx:MD has 21 buffer clears per frame and all
225 	      * of them are moved to SDMA thanks to this. */
226 	     !ws->cs_is_buffer_referenced(sctx->b.gfx.cs, rdst->buf,
227 				          RADEON_USAGE_READWRITE))) {
228 		sctx->b.dma_clear_buffer(ctx, dst, offset, size, value);
229 		return;
230 	}
231 
232 	uint64_t va = rdst->gpu_address + offset;
233 
234 	/* Flush the caches. */
235 	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
236 	                 SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
237 
238 	while (size) {
239 		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
240 		unsigned dma_flags = tc_l2_flag  | CP_DMA_CLEAR;
241 
242 		si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, 0,
243 				  &is_first, &dma_flags);
244 
245 		/* Emit the clear packet. */
246 		si_emit_cp_dma(sctx, va, value, byte_count, dma_flags, coher);
247 
248 		size -= byte_count;
249 		va += byte_count;
250 	}
251 
252 	if (tc_l2_flag)
253 		rdst->TC_L2_dirty = true;
254 
255 	/* If it's not a framebuffer fast clear... */
256 	if (coher == R600_COHERENCY_SHADER)
257 		sctx->b.num_cp_dma_calls++;
258 }
259 
260 /**
261  * Realign the CP DMA engine. This must be done after a copy with an unaligned
262  * size.
263  *
264  * \param size  Remaining size to the CP DMA alignment.
265  */
si_cp_dma_realign_engine(struct si_context * sctx,unsigned size,unsigned user_flags,bool * is_first)266 static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size,
267 				     unsigned user_flags, bool *is_first)
268 {
269 	uint64_t va;
270 	unsigned dma_flags = 0;
271 	unsigned scratch_size = CP_DMA_ALIGNMENT * 2;
272 
273 	assert(size < CP_DMA_ALIGNMENT);
274 
275 	/* Use the scratch buffer as the dummy buffer. The 3D engine should be
276 	 * idle at this point.
277 	 */
278 	if (!sctx->scratch_buffer ||
279 	    sctx->scratch_buffer->b.b.width0 < scratch_size) {
280 		r600_resource_reference(&sctx->scratch_buffer, NULL);
281 		sctx->scratch_buffer = (struct r600_resource*)
282 			pipe_buffer_create(&sctx->screen->b.b, 0,
283 					   PIPE_USAGE_DEFAULT, scratch_size);
284 		if (!sctx->scratch_buffer)
285 			return;
286 		sctx->emit_scratch_reloc = true;
287 	}
288 
289 	si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b,
290 			  &sctx->scratch_buffer->b.b, size, size, user_flags,
291 			  is_first, &dma_flags);
292 
293 	va = sctx->scratch_buffer->gpu_address;
294 	si_emit_cp_dma(sctx, va, va + CP_DMA_ALIGNMENT, size, dma_flags,
295 		       R600_COHERENCY_SHADER);
296 }
297 
298 /**
299  * Do memcpy between buffers using CP DMA.
300  *
301  * \param user_flags	bitmask of SI_CPDMA_*
302  */
si_copy_buffer(struct si_context * sctx,struct pipe_resource * dst,struct pipe_resource * src,uint64_t dst_offset,uint64_t src_offset,unsigned size,unsigned user_flags)303 void si_copy_buffer(struct si_context *sctx,
304 		    struct pipe_resource *dst, struct pipe_resource *src,
305 		    uint64_t dst_offset, uint64_t src_offset, unsigned size,
306 		    unsigned user_flags)
307 {
308 	uint64_t main_dst_offset, main_src_offset;
309 	unsigned skipped_size = 0;
310 	unsigned realign_size = 0;
311 	unsigned tc_l2_flag = get_tc_l2_flag(sctx, R600_COHERENCY_SHADER);
312 	unsigned flush_flags = get_flush_flags(sctx, R600_COHERENCY_SHADER);
313 	bool is_first = true;
314 
315 	if (!size)
316 		return;
317 
318 	if (dst != src || dst_offset != src_offset) {
319 		/* Mark the buffer range of destination as valid (initialized),
320 		 * so that transfer_map knows it should wait for the GPU when mapping
321 		 * that range. */
322 		util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset,
323 			       dst_offset + size);
324 	}
325 
326 	dst_offset += r600_resource(dst)->gpu_address;
327 	src_offset += r600_resource(src)->gpu_address;
328 
329 	/* The workarounds aren't needed on Fiji and beyond. */
330 	if (sctx->b.family <= CHIP_CARRIZO ||
331 	    sctx->b.family == CHIP_STONEY) {
332 		/* If the size is not aligned, we must add a dummy copy at the end
333 		 * just to align the internal counter. Otherwise, the DMA engine
334 		 * would slow down by an order of magnitude for following copies.
335 		 */
336 		if (size % CP_DMA_ALIGNMENT)
337 			realign_size = CP_DMA_ALIGNMENT - (size % CP_DMA_ALIGNMENT);
338 
339 		/* If the copy begins unaligned, we must start copying from the next
340 		 * aligned block and the skipped part should be copied after everything
341 		 * else has been copied. Only the src alignment matters, not dst.
342 		 */
343 		if (src_offset % CP_DMA_ALIGNMENT) {
344 			skipped_size = CP_DMA_ALIGNMENT - (src_offset % CP_DMA_ALIGNMENT);
345 			/* The main part will be skipped if the size is too small. */
346 			skipped_size = MIN2(skipped_size, size);
347 			size -= skipped_size;
348 		}
349 	}
350 
351 	/* Flush the caches. */
352 	if (!(user_flags & SI_CPDMA_SKIP_GFX_SYNC))
353 		sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
354 				 SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
355 
356 	/* This is the main part doing the copying. Src is always aligned. */
357 	main_dst_offset = dst_offset + skipped_size;
358 	main_src_offset = src_offset + skipped_size;
359 
360 	while (size) {
361 		unsigned dma_flags = tc_l2_flag;
362 		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
363 
364 		si_cp_dma_prepare(sctx, dst, src, byte_count,
365 				  size + skipped_size + realign_size,
366 				  user_flags, &is_first, &dma_flags);
367 
368 		si_emit_cp_dma(sctx, main_dst_offset, main_src_offset,
369 			       byte_count, dma_flags, R600_COHERENCY_SHADER);
370 
371 		size -= byte_count;
372 		main_src_offset += byte_count;
373 		main_dst_offset += byte_count;
374 	}
375 
376 	/* Copy the part we skipped because src wasn't aligned. */
377 	if (skipped_size) {
378 		unsigned dma_flags = tc_l2_flag;
379 
380 		si_cp_dma_prepare(sctx, dst, src, skipped_size,
381 				  skipped_size + realign_size, user_flags,
382 				  &is_first, &dma_flags);
383 
384 		si_emit_cp_dma(sctx, dst_offset, src_offset, skipped_size,
385 			       dma_flags, R600_COHERENCY_SHADER);
386 	}
387 
388 	/* Finally, realign the engine if the size wasn't aligned. */
389 	if (realign_size)
390 		si_cp_dma_realign_engine(sctx, realign_size, user_flags,
391 					 &is_first);
392 
393 	if (tc_l2_flag)
394 		r600_resource(dst)->TC_L2_dirty = true;
395 
396 	/* If it's not a prefetch... */
397 	if (dst_offset != src_offset)
398 		sctx->b.num_cp_dma_calls++;
399 }
400 
cik_prefetch_TC_L2_async(struct si_context * sctx,struct pipe_resource * buf,uint64_t offset,unsigned size)401 void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
402 			      uint64_t offset, unsigned size)
403 {
404 	assert(sctx->b.chip_class >= CIK);
405 
406 	si_copy_buffer(sctx, buf, buf, offset, offset, size, SI_CPDMA_SKIP_ALL);
407 }
408 
si_init_cp_dma_functions(struct si_context * sctx)409 void si_init_cp_dma_functions(struct si_context *sctx)
410 {
411 	sctx->b.clear_buffer = si_clear_buffer;
412 }
413