/*
 * Copyright (C) 2012-2018 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#ifndef FREEDRENO_RINGBUFFER_H_
#define FREEDRENO_RINGBUFFER_H_

#include <stdio.h>
#include "util/u_atomic.h"
#include "util/u_debug.h"
#include "util/u_queue.h"

#include "adreno_common.xml.h"
#include "adreno_pm4.xml.h"
#include "freedreno_drmif.h"
#include "freedreno_pm4.h"

#ifdef __cplusplus
extern "C" {
#endif

struct fd_submit;
struct fd_ringbuffer;
enum fd_ringbuffer_flags {

   /* Primary ringbuffer for a submit, i.e. an IB1 level rb
    * for which the kernel must set up RB->IB1 CP_INDIRECT_BRANCH
    * packets.
    */
   FD_RINGBUFFER_PRIMARY = 0x1,

   /* Hint that the stateobj will be used for streaming state
    * that is used once or a few times and then discarded.
    *
    * For sub-allocation, non-streaming stateobjs should be
    * sub-allocated from a page sized buffer, so one long-lived
    * stateobj doesn't prevent other pages from being freed.
    * (I.e. it would be no worse than allocating a page sized
    * bo for each small non-streaming stateobj.)
    *
    * But streaming stateobjs can be sub-allocated from a
    * larger buffer to reduce the alloc/del overhead.
    */
   FD_RINGBUFFER_STREAMING = 0x2,

   /* Indicates that "growable" cmdstream can be used,
    * consisting of multiple physical cmdstream buffers.
    */
   FD_RINGBUFFER_GROWABLE = 0x4,

   /* Internal use only: */
   _FD_RINGBUFFER_OBJECT = 0x8,
};

/* A submit object manages/tracks all the state buildup for a "submit"
 * ioctl to the kernel.  Additionally, with the exception of long-lived
 * non-STREAMING stateobj rb's, rb's are allocated from the submit.
 */
struct fd_submit *fd_submit_new(struct fd_pipe *pipe);

/* NOTE: all ringbuffers created from the submit should be unref'd
 * before destroying the submit.
 */
void fd_submit_del(struct fd_submit *submit);

struct fd_submit *fd_submit_ref(struct fd_submit *submit);

/* Allocate a new rb from the submit. */
struct fd_ringbuffer *fd_submit_new_ringbuffer(struct fd_submit *submit,
                                               uint32_t size,
                                               enum fd_ringbuffer_flags flags);
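
/* Example (illustrative sketch only, not part of this header's API):
 * a minimal submit lifecycle, assuming a valid fd_pipe and that 0x1000
 * bytes is a reasonable initial cmdstream size for the workload:
 *
 *    struct fd_submit *submit = fd_submit_new(pipe);
 *    struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
 *          submit, 0x1000, FD_RINGBUFFER_PRIMARY | FD_RINGBUFFER_GROWABLE);
 *
 *    ... emit cmdstream into ring ...
 *
 *    fd_submit_flush(submit, -1, NULL); // see fd_submit_flush() below
 *    fd_ringbuffer_del(ring);           // unref rb's before destroying submit
 *    fd_submit_del(submit);
 */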

/**
 * Encapsulates submit out-fence(s), which consist of a 'timestamp'
 * (per-pipe (submitqueue) sequence number) and, if requested, an
 * out-fence-fd
 */
struct fd_submit_fence {
   /**
    * The ready fence is signaled once the submit is actually flushed down
    * to the kernel, and fence/fence_fd are populated.  You must wait for
    * this fence to be signaled before reading fence/fence_fd.
    */
   struct util_queue_fence ready;

   struct fd_fence fence;

   /**
    * Optional dma_fence fd, returned by the submit if use_fence_fd is true
    */
   int fence_fd;
   bool use_fence_fd;
};

/* in_fence_fd: -1 for no in-fence, else fence fd
 * out_fence can be NULL if no output fence is required
 */
int fd_submit_flush(struct fd_submit *submit, int in_fence_fd,
                    struct fd_submit_fence *out_fence);
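
/* Example (illustrative sketch): requesting an out-fence-fd from a flush.
 * The ready fence must be waited on before fence/fence_fd are valid;
 * util_queue_fence_init()/util_queue_fence_wait() from util/u_queue.h are
 * assumed here, and error handling is omitted:
 *
 *    struct fd_submit_fence out = {
 *          .use_fence_fd = true,
 *    };
 *    util_queue_fence_init(&out.ready);
 *
 *    fd_submit_flush(submit, -1, &out);
 *
 *    util_queue_fence_wait(&out.ready); // now out.fence/out.fence_fd valid
 *    ... hand out.fence_fd to another device, close() it when done ...
 */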

struct fd_ringbuffer;
struct fd_reloc;

struct fd_ringbuffer_funcs {
   void (*grow)(struct fd_ringbuffer *ring, uint32_t size);
   void (*emit_reloc)(struct fd_ringbuffer *ring, const struct fd_reloc *reloc);
   uint32_t (*emit_reloc_ring)(struct fd_ringbuffer *ring,
                               struct fd_ringbuffer *target, uint32_t cmd_idx);
   uint32_t (*cmd_count)(struct fd_ringbuffer *ring);
   bool (*check_size)(struct fd_ringbuffer *ring);
   void (*destroy)(struct fd_ringbuffer *ring);
};

/* The ringbuffer object is not opaque so that OUT_RING() type stuff
 * can be inlined.  Note that users should not make assumptions about
 * the size of this struct.
 */
struct fd_ringbuffer {
   uint32_t *cur, *end, *start;
   const struct fd_ringbuffer_funcs *funcs;

   // size or end could probably go away
   int size;
   int32_t refcnt;
   enum fd_ringbuffer_flags flags;
};

/* Allocate a new long-lived state object, not associated with
 * a submit:
 */
struct fd_ringbuffer *fd_ringbuffer_new_object(struct fd_pipe *pipe,
                                               uint32_t size);

static inline void
fd_ringbuffer_del(struct fd_ringbuffer *ring)
{
   if (!p_atomic_dec_zero(&ring->refcnt))
      return;

   ring->funcs->destroy(ring);
}

static inline struct fd_ringbuffer *
fd_ringbuffer_ref(struct fd_ringbuffer *ring)
{
   p_atomic_inc(&ring->refcnt);
   return ring;
}

static inline void
fd_ringbuffer_grow(struct fd_ringbuffer *ring, uint32_t ndwords)
{
   assert(ring->funcs->grow); /* unsupported on kgsl */

   /* there is an upper bound on IB size, which appears to be 0x0fffff */
   ring->size = MIN2(ring->size << 1, 0x0fffff);

   ring->funcs->grow(ring, ring->size);
}

static inline bool
fd_ringbuffer_check_size(struct fd_ringbuffer *ring)
{
   return ring->funcs->check_size(ring);
}

static inline void
fd_ringbuffer_emit(struct fd_ringbuffer *ring, uint32_t data)
{
   (*ring->cur++) = data;
}

struct fd_reloc {
   struct fd_bo *bo;
   uint64_t iova;
#define FD_RELOC_READ  0x0001
#define FD_RELOC_WRITE 0x0002
#define FD_RELOC_DUMP  0x0004
   uint32_t offset;
   uint32_t orlo;
   int32_t shift;
   uint32_t orhi; /* used for a5xx+ */
};

/* We always mark BOs for write, instead of tracking it across reloc
 * sources in userspace.  On the kernel side, this means we track a single
 * excl fence in the BO instead of a set of read fences, which is cheaper.
 * The downside is that a dmabuf-shared device won't be able to read in
 * parallel with a read-only access by freedreno, but most other drivers
 * have likewise decided that usecase isn't important enough to justify
 * the extra tracking.
 */
#define FD_RELOC_FLAGS_INIT (FD_RELOC_READ | FD_RELOC_WRITE)

/* NOTE: relocs are 2 dwords on a5xx+ */

static inline void
fd_ringbuffer_reloc(struct fd_ringbuffer *ring, const struct fd_reloc *reloc)
{
   ring->funcs->emit_reloc(ring, reloc);
}

static inline uint32_t
fd_ringbuffer_cmd_count(struct fd_ringbuffer *ring)
{
   if (!ring->funcs->cmd_count)
      return 1;
   return ring->funcs->cmd_count(ring);
}

static inline uint32_t
fd_ringbuffer_emit_reloc_ring_full(struct fd_ringbuffer *ring,
                                   struct fd_ringbuffer *target,
                                   uint32_t cmd_idx)
{
   return ring->funcs->emit_reloc_ring(ring, target, cmd_idx);
}

static inline uint32_t
offset_bytes(void *end, void *start)
{
   return ((char *)end) - ((char *)start);
}

static inline uint32_t
fd_ringbuffer_size(struct fd_ringbuffer *ring)
{
   /* only really needed for stateobj ringbuffers, and won't really
    * do what you expect for growable rb's, so let's just restrict
    * this to stateobjs for now:
    */
   debug_assert(!(ring->flags & FD_RINGBUFFER_GROWABLE));
   return offset_bytes(ring->cur, ring->start);
}

static inline bool
fd_ringbuffer_empty(struct fd_ringbuffer *ring)
{
   return (fd_ringbuffer_cmd_count(ring) == 1) &&
          (offset_bytes(ring->cur, ring->start) == 0);
}

#define LOG_DWORDS 0

static inline void
OUT_RING(struct fd_ringbuffer *ring, uint32_t data)
{
   if (LOG_DWORDS) {
      fprintf(stderr, "ring[%p]: OUT_RING %04x: %08x\n", ring,
              (uint32_t)(ring->cur - ring->start), data);
   }
   fd_ringbuffer_emit(ring, data);
}

/*
 * NOTE: OUT_RELOC() is 2 dwords (64b) on a5xx+
 */
#ifndef __cplusplus
static inline void
OUT_RELOC(struct fd_ringbuffer *ring, struct fd_bo *bo, uint32_t offset,
          uint64_t or, int32_t shift)
{
   if (LOG_DWORDS) {
      fprintf(stderr, "ring[%p]: OUT_RELOC %04x: %p+%u << %d\n", ring,
              (uint32_t)(ring->cur - ring->start), bo, offset, shift);
   }
   debug_assert(offset < fd_bo_size(bo));

   uint64_t iova = fd_bo_get_iova(bo) + offset;

   if (shift < 0)
      iova >>= -shift;
   else
      iova <<= shift;

   iova |= or;

   fd_ringbuffer_reloc(ring, &(struct fd_reloc){
      .bo = bo,
      .iova = iova,
      .offset = offset,
      .orlo = or,
      .shift = shift,
      .orhi = or >> 32,
   });
}
#endif
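
/* Example (illustrative sketch): emitting a 64-bit buffer address, as a
 * CP_MEM_WRITE destination might need, assuming a valid bo with an iova
 * already assigned; or=0 and shift=0 emit the address unmodified:
 *
 *    OUT_PKT7(ring, CP_MEM_WRITE, 3);
 *    OUT_RELOC(ring, bo, 0, 0, 0);  // addr lo/hi (2 dwords on a5xx+)
 *    OUT_RING(ring, 0xdeadbeef);    // payload dword
 */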

static inline void
OUT_RB(struct fd_ringbuffer *ring, struct fd_ringbuffer *target)
{
   fd_ringbuffer_emit_reloc_ring_full(ring, target, 0);
}
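
/* Example (illustrative sketch): building a long-lived stateobj once and
 * referencing it from a submit's ring; size and contents are placeholders:
 *
 *    struct fd_ringbuffer *stateobj =
 *          fd_ringbuffer_new_object(pipe, 0x100);
 *    ... emit state into stateobj ...
 *
 *    // later, per draw/submit:
 *    OUT_RB(ring, stateobj);
 *
 *    // when the state is no longer needed:
 *    fd_ringbuffer_del(stateobj);
 */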

static inline void
BEGIN_RING(struct fd_ringbuffer *ring, uint32_t ndwords)
{
   if (unlikely(ring->cur + ndwords > ring->end))
      fd_ringbuffer_grow(ring, ndwords);
}

static inline void
OUT_PKT0(struct fd_ringbuffer *ring, uint16_t regindx, uint16_t cnt)
{
   BEGIN_RING(ring, cnt + 1);
   OUT_RING(ring, pm4_pkt0_hdr(regindx, cnt));
}

static inline void
OUT_PKT2(struct fd_ringbuffer *ring)
{
   BEGIN_RING(ring, 1);
   OUT_RING(ring, CP_TYPE2_PKT);
}

static inline void
OUT_PKT3(struct fd_ringbuffer *ring, uint8_t opcode, uint16_t cnt)
{
   BEGIN_RING(ring, cnt + 1);
   OUT_RING(ring, CP_TYPE3_PKT | ((cnt - 1) << 16) | ((opcode & 0xFF) << 8));
}

/*
 * Starting with a5xx, pkt4/pkt7 are used instead of pkt0/pkt3
 */

static inline void
OUT_PKT4(struct fd_ringbuffer *ring, uint16_t regindx, uint16_t cnt)
{
   BEGIN_RING(ring, cnt + 1);
   OUT_RING(ring, pm4_pkt4_hdr(regindx, cnt));
}

static inline void
OUT_PKT7(struct fd_ringbuffer *ring, uint8_t opcode, uint16_t cnt)
{
   BEGIN_RING(ring, cnt + 1);
   OUT_RING(ring, pm4_pkt7_hdr(opcode, cnt));
}
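
/* Example (illustrative sketch): a pkt4 register write on a5xx+; the
 * register offset REG_EXAMPLE is a placeholder, not a real register:
 *
 *    OUT_PKT4(ring, REG_EXAMPLE, 2);  // write 2 consecutive registers
 *    OUT_RING(ring, 0x00000001);      // REG_EXAMPLE
 *    OUT_RING(ring, 0x00000002);      // REG_EXAMPLE + 1
 */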

static inline void
OUT_WFI(struct fd_ringbuffer *ring)
{
   OUT_PKT3(ring, CP_WAIT_FOR_IDLE, 1);
   OUT_RING(ring, 0x00000000);
}

static inline void
OUT_WFI5(struct fd_ringbuffer *ring)
{
   OUT_PKT7(ring, CP_WAIT_FOR_IDLE, 0);
}

#ifdef __cplusplus
} /* end of extern "C" */
#endif

#endif /* FREEDRENO_RINGBUFFER_H_ */