/*
 * Copyright 2012 Nouveau Project
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Christoph Bumiller
 */

#include "nvc0/nvc0_context.h"
#include "nvc0/nve4_compute.h"

#include "codegen/nv50_ir_driver.h"

#include "drf.h"
#include "qmd.h"
#include "cla0c0qmd.h"
#include "clc0c0qmd.h"
#include "clc3c0qmd.h"

#define NVA0C0_QMDV00_06_VAL_SET(p,a...) NVVAL_MW_SET((p), NVA0C0, QMDV00_06, ##a)
#define NVA0C0_QMDV00_06_DEF_SET(p,a...) NVDEF_MW_SET((p), NVA0C0, QMDV00_06, ##a)
#define NVC0C0_QMDV02_01_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC0C0, QMDV02_01, ##a)
#define NVC0C0_QMDV02_01_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC0C0, QMDV02_01, ##a)
#define NVC3C0_QMDV02_02_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC3C0, QMDV02_02, ##a)
#define NVC3C0_QMDV02_02_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC3C0, QMDV02_02, ##a)
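/* Each *_SET wrapper above writes one field of the Queue Meta Data (QMD)
 * image for the given QMD version, going through the multi-word DRF
 * accessors and the field layouts from the cl*c0qmd.h headers; e.g.
 * NVA0C0_QMDV00_06_VAL_SET(qmd, REGISTER_COUNT, cp->num_gprs) expands to
 * NVVAL_MW_SET((qmd), NVA0C0, QMDV00_06, REGISTER_COUNT, cp->num_gprs).
 */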

int
nve4_screen_compute_setup(struct nvc0_screen *screen,
                          struct nouveau_pushbuf *push)
{
   struct nouveau_device *dev = screen->base.device;
   struct nouveau_object *chan = screen->base.channel;
   int i;
   int ret;
   uint32_t obj_class;
   uint64_t address;

   switch (dev->chipset & ~0xf) {
   case 0x160:
      obj_class = TU102_COMPUTE_CLASS;
      break;
   case 0x140:
      obj_class = GV100_COMPUTE_CLASS;
      break;
   case 0x100:
   case 0xf0:
      obj_class = NVF0_COMPUTE_CLASS; /* GK110 */
      break;
   case 0xe0:
      obj_class = NVE4_COMPUTE_CLASS; /* GK104 */
      break;
   case 0x110:
      obj_class = GM107_COMPUTE_CLASS;
      break;
   case 0x120:
      obj_class = GM200_COMPUTE_CLASS;
      break;
   case 0x130:
      obj_class = (dev->chipset == 0x130 || dev->chipset == 0x13b) ?
                  GP100_COMPUTE_CLASS : GP104_COMPUTE_CLASS;
      break;
   default:
      NOUVEAU_ERR("unsupported chipset: NV%02x\n", dev->chipset);
      return -1;
   }

   ret = nouveau_object_new(chan, 0xbeef00c0, obj_class, NULL, 0,
                            &screen->compute);
   if (ret) {
      NOUVEAU_ERR("Failed to allocate compute object: %d\n", ret);
      return ret;
   }

   BEGIN_NVC0(push, SUBC_CP(NV01_SUBCHAN_OBJECT), 1);
   PUSH_DATA (push, screen->compute->oclass);

   BEGIN_NVC0(push, NVE4_CP(TEMP_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, screen->tls->offset);
   PUSH_DATA (push, screen->tls->offset);
   /* No idea why there are 2. Divide size by 2 to be safe.
    * Actually this might be per-MP TEMP size and looks like I'm only using
    * 2 MPs instead of all 8.
    */
   BEGIN_NVC0(push, NVE4_CP(MP_TEMP_SIZE_HIGH(0)), 3);
   PUSH_DATAh(push, screen->tls->size / screen->mp_count);
   PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff);
   PUSH_DATA (push, 0xff);
   if (obj_class < GV100_COMPUTE_CLASS) {
      BEGIN_NVC0(push, NVE4_CP(MP_TEMP_SIZE_HIGH(1)), 3);
      PUSH_DATAh(push, screen->tls->size / screen->mp_count);
      PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff);
      PUSH_DATA (push, 0xff);
   }

   /* Unified address space? Who needs that? Certainly not OpenCL.
    *
    * FATAL: Buffers with addresses inside [0x1000000, 0x3000000] will NOT be
    * accessible. We cannot prevent that at the moment, so expect failure.
    */
   if (obj_class < GV100_COMPUTE_CLASS) {
      BEGIN_NVC0(push, NVE4_CP(LOCAL_BASE), 1);
      PUSH_DATA (push, 0xff << 24);
      BEGIN_NVC0(push, NVE4_CP(SHARED_BASE), 1);
      PUSH_DATA (push, 0xfe << 24);

      BEGIN_NVC0(push, NVE4_CP(CODE_ADDRESS_HIGH), 2);
      PUSH_DATAh(push, screen->text->offset);
      PUSH_DATA (push, screen->text->offset);
   } else {
      BEGIN_NVC0(push, SUBC_CP(0x2a0), 2);
      PUSH_DATAh(push, 0xfeULL << 24);
      PUSH_DATA (push, 0xfeULL << 24);
      BEGIN_NVC0(push, SUBC_CP(0x7b0), 2);
      PUSH_DATAh(push, 0xffULL << 24);
      PUSH_DATA (push, 0xffULL << 24);
   }

   BEGIN_NVC0(push, SUBC_CP(0x0310), 1);
   PUSH_DATA (push, (obj_class >= NVF0_COMPUTE_CLASS) ? 0x400 : 0x300);

   /* NOTE: these do not affect the state used by the 3D object */
   BEGIN_NVC0(push, NVE4_CP(TIC_ADDRESS_HIGH), 3);
   PUSH_DATAh(push, screen->txc->offset);
   PUSH_DATA (push, screen->txc->offset);
   PUSH_DATA (push, NVC0_TIC_MAX_ENTRIES - 1);
   BEGIN_NVC0(push, NVE4_CP(TSC_ADDRESS_HIGH), 3);
   PUSH_DATAh(push, screen->txc->offset + 65536);
   PUSH_DATA (push, screen->txc->offset + 65536);
   PUSH_DATA (push, NVC0_TSC_MAX_ENTRIES - 1);

   if (obj_class >= NVF0_COMPUTE_CLASS) {
      /* The blob calls GK110_COMPUTE.FIRMWARE[0x6], along with the args (0x1)
       * passed with GK110_COMPUTE.GRAPH.SCRATCH[0x2]. This is currently
       * disabled because our firmware doesn't support these commands and the
       * GPU hangs if they are used. */
      BEGIN_NIC0(push, SUBC_CP(0x0248), 64);
      for (i = 63; i >= 0; i--)
         PUSH_DATA(push, 0x38000 | i);
      IMMED_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 0);
   }

   BEGIN_NVC0(push, NVE4_CP(TEX_CB_INDEX), 1);
   PUSH_DATA (push, 7); /* does not interfere with 3D */

   /* Disabling this UNK command avoids a read fault when using texelFetch()
    * from a compute shader, for reasons that are not understood.
   if (obj_class == NVF0_COMPUTE_CLASS)
      IMMED_NVC0(push, SUBC_CP(0x02c4), 1);
   */

   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);

   /* MS sample coordinate offsets: these do not work with _ALT modes! */
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, address + NVC0_CB_AUX_MS_INFO);
   PUSH_DATA (push, address + NVC0_CB_AUX_MS_INFO);
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, 64);
   PUSH_DATA (push, 1);
   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 17);
   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
   PUSH_DATA (push, 0); /* 0 */
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 1); /* 1 */
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 0); /* 2 */
   PUSH_DATA (push, 1);
   PUSH_DATA (push, 1); /* 3 */
   PUSH_DATA (push, 1);
   PUSH_DATA (push, 2); /* 4 */
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 3); /* 5 */
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 2); /* 6 */
   PUSH_DATA (push, 1);
   PUSH_DATA (push, 3); /* 7 */
   PUSH_DATA (push, 1);

#ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR);
   PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR);
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, 28);
   PUSH_DATA (push, 1);
   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 8);
   PUSH_DATA (push, 1);
   PUSH_DATA (push, screen->parm->offset + NVE4_CP_PARAM_TRAP_INFO);
   PUSH_DATAh(push, screen->parm->offset + NVE4_CP_PARAM_TRAP_INFO);
   PUSH_DATA (push, screen->tls->offset);
   PUSH_DATAh(push, screen->tls->offset);
   PUSH_DATA (push, screen->tls->size / 2); /* MP TEMP block size */
   PUSH_DATA (push, screen->tls->size / 2 / 64); /* warp TEMP block size */
   PUSH_DATA (push, 0); /* warp cfstack size */
#endif

   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);

   return 0;
}

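/* On GM107+ an image view additionally gets a TIC entry; its handle is
 * written into the auxiliary constbuf at NVC0_CB_AUX_TEX_INFO(slot + 32),
 * presumably so the shader can reach the surface through the texture path.
 */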
static void
gm107_compute_validate_surfaces(struct nvc0_context *nvc0,
                                struct pipe_image_view *view, int slot)
{
   struct nv04_resource *res = nv04_resource(view->resource);
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_bo *txc = nvc0->screen->txc;
   struct nv50_tic_entry *tic;
   uint64_t address;
   const int s = 5;

   tic = nv50_tic_entry(nvc0->images_tic[s][slot]);

   res = nv04_resource(tic->pipe.texture);
   nvc0_update_tic(nvc0, tic, res);

   if (tic->id < 0) {
      tic->id = nvc0_screen_tic_alloc(nvc0->screen, tic);

      /* upload the texture view */
      PUSH_SPACE(push, 16);
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
      PUSH_DATAh(push, txc->offset + (tic->id * 32));
      PUSH_DATA (push, txc->offset + (tic->id * 32));
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
      PUSH_DATA (push, 32);
      PUSH_DATA (push, 1);
      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 9);
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
      PUSH_DATAp(push, &tic->tic[0], 8);

      BEGIN_NIC0(push, NVE4_CP(TIC_FLUSH), 1);
      PUSH_DATA (push, (tic->id << 4) | 1);
   } else
   if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
      BEGIN_NIC0(push, NVE4_CP(TEX_CACHE_CTL), 1);
      PUSH_DATA (push, (tic->id << 4) | 1);
   }
   nvc0->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32);

   res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
   res->status |= NOUVEAU_BUFFER_STATUS_GPU_READING;

   BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RD);

   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);

   /* upload the texture handle */
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, address + NVC0_CB_AUX_TEX_INFO(slot + 32));
   PUSH_DATA (push, address + NVC0_CB_AUX_TEX_INFO(slot + 32));
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, 4);
   PUSH_DATA (push, 0x1);
   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 2);
   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
   PUSH_DATA (push, tic->id);

   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
}

static void
nve4_compute_validate_surfaces(struct nvc0_context *nvc0)
{
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   uint64_t address;
   const int s = 5;
   int i, j;

   if (!nvc0->images_dirty[s])
      return;

   address = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);

   for (i = 0; i < NVC0_MAX_IMAGES; ++i) {
      struct pipe_image_view *view = &nvc0->images[s][i];

      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
      PUSH_DATAh(push, address + NVC0_CB_AUX_SU_INFO(i));
      PUSH_DATA (push, address + NVC0_CB_AUX_SU_INFO(i));
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
      PUSH_DATA (push, 16 * 4);
      PUSH_DATA (push, 0x1);
      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 16);
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));

      if (view->resource) {
         struct nv04_resource *res = nv04_resource(view->resource);

         if (res->base.target == PIPE_BUFFER) {
            if (view->access & PIPE_IMAGE_ACCESS_WRITE)
               nvc0_mark_image_range_valid(view);
         }

         nve4_set_surface_info(push, view, nvc0);
         BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RDWR);

         if (nvc0->screen->base.class_3d >= GM107_3D_CLASS)
            gm107_compute_validate_surfaces(nvc0, view, i);
      } else {
         for (j = 0; j < 16; j++)
            PUSH_DATA(push, 0);
      }
   }
}

/* Thankfully, textures with samplers follow the normal rules. */
static void
nve4_compute_validate_samplers(struct nvc0_context *nvc0)
{
   bool need_flush = nve4_validate_tsc(nvc0, 5);
   if (need_flush) {
      BEGIN_NVC0(nvc0->base.pushbuf, NVE4_CP(TSC_FLUSH), 1);
      PUSH_DATA (nvc0->base.pushbuf, 0);
   }

   /* Invalidate all 3D samplers because they are aliased. */
   for (int s = 0; s < 5; s++)
      nvc0->samplers_dirty[s] = ~0;
   nvc0->dirty_3d |= NVC0_NEW_3D_SAMPLERS;
}

/* (Code duplicated at bottom for various non-convincing reasons.
 * E.g. we might want to use the COMPUTE subchannel to upload TIC/TSC
 * entries to avoid a subchannel switch.
 * Same for texture cache flushes.
 * Also, the bufctx differs, and more IFs in the 3D version would look ugly.)
 */
static void nve4_compute_validate_textures(struct nvc0_context *);

static void
nve4_compute_set_tex_handles(struct nvc0_context *nvc0)
{
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_screen *screen = nvc0->screen;
   uint64_t address;
   const unsigned s = nvc0_shader_stage(PIPE_SHADER_COMPUTE);
   unsigned i, n;
   uint32_t dirty = nvc0->textures_dirty[s] | nvc0->samplers_dirty[s];

   if (!dirty)
      return;
   i = ffs(dirty) - 1;
   n = util_logbase2(dirty) + 1 - i;
   assert(n);

   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);

   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, address + NVC0_CB_AUX_TEX_INFO(i));
   PUSH_DATA (push, address + NVC0_CB_AUX_TEX_INFO(i));
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, n * 4);
   PUSH_DATA (push, 0x1);
   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + n);
   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
   PUSH_DATAp(push, &nvc0->tex_handles[s][i], n);

   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);

   nvc0->textures_dirty[s] = 0;
   nvc0->samplers_dirty[s] = 0;
}

static void
nve4_compute_validate_constbufs(struct nvc0_context *nvc0)
{
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   const int s = 5;

   while (nvc0->constbuf_dirty[s]) {
      int i = ffs(nvc0->constbuf_dirty[s]) - 1;
      nvc0->constbuf_dirty[s] &= ~(1 << i);

      if (nvc0->constbuf[s][i].user) {
         struct nouveau_bo *bo = nvc0->screen->uniform_bo;
         const unsigned base = NVC0_CB_USR_INFO(s);
         const unsigned size = nvc0->constbuf[s][0].size;
         assert(i == 0); /* we really only want OpenGL uniforms here */
         assert(nvc0->constbuf[s][0].u.data);

         BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
         PUSH_DATAh(push, bo->offset + base);
         PUSH_DATA (push, bo->offset + base);
         BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
         PUSH_DATA (push, size);
         PUSH_DATA (push, 0x1);
         BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (size / 4));
         PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
         PUSH_DATAp(push, nvc0->constbuf[s][0].u.data, size / 4);
      }
      else {
         struct nv04_resource *res =
            nv04_resource(nvc0->constbuf[s][i].u.buf);
         if (res) {
            uint64_t address
               = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);

            /* constbufs above 0 are fetched via ubo info in the shader */
            if (i > 0) {
               BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
               PUSH_DATAh(push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
               PUSH_DATA (push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
               BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
               PUSH_DATA (push, 4 * 4);
               PUSH_DATA (push, 0x1);
               BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4);
               PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));

               PUSH_DATA (push, res->address + nvc0->constbuf[s][i].offset);
               PUSH_DATAh(push, res->address + nvc0->constbuf[s][i].offset);
               PUSH_DATA (push, nvc0->constbuf[s][i].size);
               PUSH_DATA (push, 0);
            }

            BCTX_REFN(nvc0->bufctx_cp, CP_CB(i), res, RD);
            res->cb_bindings[s] |= 1 << i;
         }
      }
   }

   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
}

static void
nve4_compute_validate_buffers(struct nvc0_context *nvc0)
{
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   uint64_t address;
   const int s = 5;
   int i;

   address = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);

   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, address + NVC0_CB_AUX_BUF_INFO(0));
   PUSH_DATA (push, address + NVC0_CB_AUX_BUF_INFO(0));
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, 4 * NVC0_MAX_BUFFERS * 4);
   PUSH_DATA (push, 0x1);
   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4 * NVC0_MAX_BUFFERS);
   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));

   for (i = 0; i < NVC0_MAX_BUFFERS; i++) {
      if (nvc0->buffers[s][i].buffer) {
         struct nv04_resource *res =
            nv04_resource(nvc0->buffers[s][i].buffer);
         PUSH_DATA (push, res->address + nvc0->buffers[s][i].buffer_offset);
         PUSH_DATAh(push, res->address + nvc0->buffers[s][i].buffer_offset);
         PUSH_DATA (push, nvc0->buffers[s][i].buffer_size);
         PUSH_DATA (push, 0);
         BCTX_REFN(nvc0->bufctx_cp, CP_BUF, res, RDWR);
         util_range_add(&res->base, &res->valid_buffer_range,
                        nvc0->buffers[s][i].buffer_offset,
                        nvc0->buffers[s][i].buffer_offset +
                        nvc0->buffers[s][i].buffer_size);
      } else {
         PUSH_DATA (push, 0);
         PUSH_DATA (push, 0);
         PUSH_DATA (push, 0);
         PUSH_DATA (push, 0);
      }
   }
}

static struct nvc0_state_validate
validate_list_cp[] = {
   { nvc0_compprog_validate,           NVC0_NEW_CP_PROGRAM  },
   { nve4_compute_validate_textures,   NVC0_NEW_CP_TEXTURES },
   { nve4_compute_validate_samplers,   NVC0_NEW_CP_SAMPLERS },
   { nve4_compute_set_tex_handles,     NVC0_NEW_CP_TEXTURES |
                                       NVC0_NEW_CP_SAMPLERS },
   { nve4_compute_validate_surfaces,   NVC0_NEW_CP_SURFACES },
   { nvc0_compute_validate_globals,    NVC0_NEW_CP_GLOBALS  },
   { nve4_compute_validate_buffers,    NVC0_NEW_CP_BUFFERS  },
   { nve4_compute_validate_constbufs,  NVC0_NEW_CP_CONSTBUF },
};
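/* nve4_state_validate_cp() below hands this table to nvc0_state_validate(),
 * which invokes each entry whose dirty bits intersect the requested mask, in
 * the order listed here.
 */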

static bool
nve4_state_validate_cp(struct nvc0_context *nvc0, uint32_t mask)
{
   bool ret;

   ret = nvc0_state_validate(nvc0, mask, validate_list_cp,
                             ARRAY_SIZE(validate_list_cp), &nvc0->dirty_cp,
                             nvc0->bufctx_cp);

   if (unlikely(nvc0->state.flushed))
      nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true);
   return ret;
}

static void
nve4_compute_upload_input(struct nvc0_context *nvc0,
                          const struct pipe_grid_info *info)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_program *cp = nvc0->compprog;
   uint64_t address;

   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);

   if (cp->parm_size) {
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
      PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_USR_INFO(5));
      PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_USR_INFO(5));
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
      PUSH_DATA (push, cp->parm_size);
      PUSH_DATA (push, 0x1);
      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + DIV_ROUND_UP(cp->parm_size, 4));
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
      PUSH_DATAb(push, info->input, cp->parm_size);
   }
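
   /* Upload the grid information into the driver's auxiliary constbuf at
    * NVC0_CB_AUX_GRID_INFO(0): 8 dwords consisting of block[0..2],
    * grid[0..2], a zero dword and work_dim.
    */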
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, address + NVC0_CB_AUX_GRID_INFO(0));
   PUSH_DATA (push, address + NVC0_CB_AUX_GRID_INFO(0));
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, 8 * 4);
   PUSH_DATA (push, 0x1);

   if (unlikely(info->indirect)) {
      struct nv04_resource *res = nv04_resource(info->indirect);
      uint32_t offset = res->offset + info->indirect_offset;

      nouveau_pushbuf_space(push, 32, 0, 1);
      PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);

      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 8);
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
      PUSH_DATAp(push, info->block, 3);
      nouveau_pushbuf_data(push, res->bo, offset,
                           NVC0_IB_ENTRY_1_NO_PREFETCH | 3 * 4);
   } else {
      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 8);
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
      PUSH_DATAp(push, info->block, 3);
      PUSH_DATAp(push, info->grid, 3);
   }
   PUSH_DATA (push, 0);
   PUSH_DATA (push, info->work_dim);

   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
}

static inline void
gp100_cp_launch_desc_set_cb(uint32_t *qmd, unsigned index,
                            struct nouveau_bo *bo, uint32_t base, uint32_t size)
{
   uint64_t address = bo->offset + base;

   assert(index < 8);
   assert(!(base & 0xff));

   NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_LOWER, index, address);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_UPPER, index, address >> 32);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_SIZE_SHIFTED4, index,
                            DIV_ROUND_UP(size, 16));
   NVC0C0_QMDV02_01_DEF_SET(qmd, CONSTANT_BUFFER_VALID, index, TRUE);
}

static inline void
nve4_cp_launch_desc_set_cb(uint32_t *qmd, unsigned index, struct nouveau_bo *bo,
                           uint32_t base, uint32_t size)
{
   uint64_t address = bo->offset + base;

   assert(index < 8);
   assert(!(base & 0xff));

   NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_LOWER, index, address);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_UPPER, index, address >> 32);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_SIZE, index, size);
   NVA0C0_QMDV00_06_DEF_SET(qmd, CONSTANT_BUFFER_VALID, index, TRUE);
}

static void
nve4_compute_setup_buf_cb(struct nvc0_context *nvc0, bool gp100, void *desc)
{
   // Only constant buffers 0-6 can be bound through the launch descriptor
   // (slot 7 holds the driver's constbuf); the rest are loaded through
   // global memory.
   for (int i = 0; i <= 6; i++) {
      if (nvc0->constbuf[5][i].user || !nvc0->constbuf[5][i].u.buf)
         continue;

      struct nv04_resource *res =
         nv04_resource(nvc0->constbuf[5][i].u.buf);

      uint32_t base = res->offset + nvc0->constbuf[5][i].offset;
      uint32_t size = nvc0->constbuf[5][i].size;
      if (gp100)
         gp100_cp_launch_desc_set_cb(desc, i, res->bo, base, size);
      else
         nve4_cp_launch_desc_set_cb(desc, i, res->bo, base, size);
   }

   // There is no need to do FLUSH(NVE4_COMPUTE_FLUSH_CB) because
   // nve4_compute_upload_input() does it later.
}

static void
nve4_compute_setup_launch_desc(struct nvc0_context *nvc0, uint32_t *qmd,
                               const struct pipe_grid_info *info)
{
   const struct nvc0_screen *screen = nvc0->screen;
   const struct nvc0_program *cp = nvc0->compprog;

   NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_TEXTURE_HEADER_CACHE, TRUE);
   NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_TEXTURE_SAMPLER_CACHE, TRUE);
   NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_TEXTURE_DATA_CACHE, TRUE);
   NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_SHADER_DATA_CACHE, TRUE);
   NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_SHADER_CONSTANT_CACHE, TRUE);
   NVA0C0_QMDV00_06_DEF_SET(qmd, RELEASE_MEMBAR_TYPE, FE_SYSMEMBAR);
   NVA0C0_QMDV00_06_DEF_SET(qmd, CWD_MEMBAR_TYPE, L1_SYSMEMBAR);
   NVA0C0_QMDV00_06_DEF_SET(qmd, API_VISIBLE_CALL_LIMIT, NO_CHECK);
   NVA0C0_QMDV00_06_VAL_SET(qmd, SASS_VERSION, 0x30);

   NVA0C0_QMDV00_06_VAL_SET(qmd, PROGRAM_OFFSET, cp->code_base);

   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_WIDTH, info->grid[0]);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_HEIGHT, info->grid[1]);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_DEPTH, info->grid[2]);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_THREAD_DIMENSION0, info->block[0]);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_THREAD_DIMENSION1, info->block[1]);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_THREAD_DIMENSION2, info->block[2]);

   NVA0C0_QMDV00_06_VAL_SET(qmd, SHARED_MEMORY_SIZE,
                            align(cp->cp.smem_size, 0x100));
   NVA0C0_QMDV00_06_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE,
                            (cp->hdr[1] & 0xfffff0) +
                            align(cp->cp.lmem_size, 0x10));
   NVA0C0_QMDV00_06_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0);
   NVA0C0_QMDV00_06_VAL_SET(qmd, SHADER_LOCAL_MEMORY_CRS_SIZE, 0x800);

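   /* Kepler splits the per-SM 64 KiB L1/shared array; pick the smallest
    * directly addressable (shared) partition that still fits the kernel's
    * declared shared memory usage.
    */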
   if (cp->cp.smem_size > (32 << 10))
      NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION,
                               DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB);
   else
   if (cp->cp.smem_size > (16 << 10))
      NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION,
                               DIRECTLY_ADDRESSABLE_MEMORY_SIZE_32KB);
   else
      NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION,
                               DIRECTLY_ADDRESSABLE_MEMORY_SIZE_16KB);

   NVA0C0_QMDV00_06_VAL_SET(qmd, REGISTER_COUNT, cp->num_gprs);
   NVA0C0_QMDV00_06_VAL_SET(qmd, BARRIER_COUNT, cp->num_barriers);

   // Only bind user uniforms and the driver constant buffer through the
   // launch descriptor; UBOs are routed through the driver constbuf instead,
   // to get around the limit of 8 CBs.
   if (nvc0->constbuf[5][0].user || cp->parm_size) {
      nve4_cp_launch_desc_set_cb(qmd, 0, screen->uniform_bo,
                                 NVC0_CB_USR_INFO(5), 1 << 16);

      // Later logic will attempt to bind a real buffer at position 0. That
      // should not happen if we've bound a user buffer.
      assert(nvc0->constbuf[5][0].user || !nvc0->constbuf[5][0].u.buf);
   }
   nve4_cp_launch_desc_set_cb(qmd, 7, screen->uniform_bo,
                              NVC0_CB_AUX_INFO(5), 1 << 11);

   nve4_compute_setup_buf_cb(nvc0, false, qmd);
}

static void
gp100_compute_setup_launch_desc(struct nvc0_context *nvc0, uint32_t *qmd,
                                const struct pipe_grid_info *info)
{
   const struct nvc0_screen *screen = nvc0->screen;
   const struct nvc0_program *cp = nvc0->compprog;

   NVC0C0_QMDV02_01_VAL_SET(qmd, SM_GLOBAL_CACHING_ENABLE, 1);
   NVC0C0_QMDV02_01_DEF_SET(qmd, RELEASE_MEMBAR_TYPE, FE_SYSMEMBAR);
   NVC0C0_QMDV02_01_DEF_SET(qmd, CWD_MEMBAR_TYPE, L1_SYSMEMBAR);
   NVC0C0_QMDV02_01_DEF_SET(qmd, API_VISIBLE_CALL_LIMIT, NO_CHECK);

   NVC0C0_QMDV02_01_VAL_SET(qmd, PROGRAM_OFFSET, cp->code_base);

   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_WIDTH, info->grid[0]);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_HEIGHT, info->grid[1]);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_DEPTH, info->grid[2]);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_THREAD_DIMENSION0, info->block[0]);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_THREAD_DIMENSION1, info->block[1]);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_THREAD_DIMENSION2, info->block[2]);

   NVC0C0_QMDV02_01_VAL_SET(qmd, SHARED_MEMORY_SIZE,
                            align(cp->cp.smem_size, 0x100));
   NVC0C0_QMDV02_01_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE,
                            (cp->hdr[1] & 0xfffff0) +
                            align(cp->cp.lmem_size, 0x10));
   NVC0C0_QMDV02_01_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0);
   NVC0C0_QMDV02_01_VAL_SET(qmd, SHADER_LOCAL_MEMORY_CRS_SIZE, 0x800);

   NVC0C0_QMDV02_01_VAL_SET(qmd, REGISTER_COUNT, cp->num_gprs);
   NVC0C0_QMDV02_01_VAL_SET(qmd, BARRIER_COUNT, cp->num_barriers);

   // Only bind user uniforms and the driver constant buffer through the
   // launch descriptor; UBOs are routed through the driver constbuf instead,
   // to get around the limit of 8 CBs.
   if (nvc0->constbuf[5][0].user || cp->parm_size) {
      gp100_cp_launch_desc_set_cb(qmd, 0, screen->uniform_bo,
                                  NVC0_CB_USR_INFO(5), 1 << 16);

      // Later logic will attempt to bind a real buffer at position 0. That
      // should not happen if we've bound a user buffer.
      assert(nvc0->constbuf[5][0].user || !nvc0->constbuf[5][0].u.buf);
   }
   gp100_cp_launch_desc_set_cb(qmd, 7, screen->uniform_bo,
                               NVC0_CB_AUX_INFO(5), 1 << 11);

   nve4_compute_setup_buf_cb(nvc0, true, qmd);
}

static int
gv100_sm_config_smem_size(u32 size)
{
   if (size > 64 * 1024) size = 96 * 1024;
   else if (size > 32 * 1024) size = 64 * 1024;
   else if (size > 16 * 1024) size = 32 * 1024;
   else if (size > 8 * 1024) size = 16 * 1024;
   else size = 8 * 1024;
   return (size / 4096) + 1;
}
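
/* The *_SM_CONFIG_SHARED_MEM_SIZE QMD fields apparently take the rounded-up
 * shared memory size in 4 KiB units plus one, e.g. 8 KiB -> 3 and
 * 96 KiB -> 25.
 */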

static void
gv100_compute_setup_launch_desc(struct nvc0_context *nvc0, u32 *qmd,
                                const struct pipe_grid_info *info)
{
   struct nvc0_program *cp = nvc0->compprog;
   struct nvc0_screen *screen = nvc0->screen;
   uint64_t entry = screen->text->offset + cp->code_base;

   NVC3C0_QMDV02_02_VAL_SET(qmd, SM_GLOBAL_CACHING_ENABLE, 1);
   NVC3C0_QMDV02_02_DEF_SET(qmd, API_VISIBLE_CALL_LIMIT, NO_CHECK);
   NVC3C0_QMDV02_02_DEF_SET(qmd, SAMPLER_INDEX, INDEPENDENTLY);
   NVC3C0_QMDV02_02_VAL_SET(qmd, SHARED_MEMORY_SIZE,
                            align(cp->cp.smem_size, 0x100));
   NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE,
                            (cp->hdr[1] & 0xfffff0) +
                            align(cp->cp.lmem_size, 0x10));
   NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0);
   NVC3C0_QMDV02_02_VAL_SET(qmd, MIN_SM_CONFIG_SHARED_MEM_SIZE,
                            gv100_sm_config_smem_size(8 * 1024));
   NVC3C0_QMDV02_02_VAL_SET(qmd, MAX_SM_CONFIG_SHARED_MEM_SIZE,
                            gv100_sm_config_smem_size(96 * 1024));
   NVC3C0_QMDV02_02_VAL_SET(qmd, QMD_VERSION, 2);
   NVC3C0_QMDV02_02_VAL_SET(qmd, QMD_MAJOR_VERSION, 2);
   NVC3C0_QMDV02_02_VAL_SET(qmd, TARGET_SM_CONFIG_SHARED_MEM_SIZE,
                            gv100_sm_config_smem_size(cp->cp.smem_size));

   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_WIDTH, info->grid[0]);
   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_HEIGHT, info->grid[1]);
   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_DEPTH, info->grid[2]);
   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION0, info->block[0]);
   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION1, info->block[1]);
   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION2, info->block[2]);
   NVC3C0_QMDV02_02_VAL_SET(qmd, REGISTER_COUNT_V, cp->num_gprs);
   NVC3C0_QMDV02_02_VAL_SET(qmd, BARRIER_COUNT, cp->num_barriers);

   // Only bind user uniforms and the driver constant buffer through the
   // launch descriptor; UBOs are routed through the driver constbuf instead,
   // to get around the limit of 8 CBs.
   if (nvc0->constbuf[5][0].user || cp->parm_size) {
      gp100_cp_launch_desc_set_cb(qmd, 0, screen->uniform_bo,
                                  NVC0_CB_USR_INFO(5), 1 << 16);

      // Later logic will attempt to bind a real buffer at position 0. That
      // should not happen if we've bound a user buffer.
      assert(nvc0->constbuf[5][0].user || !nvc0->constbuf[5][0].u.buf);
   }
   gp100_cp_launch_desc_set_cb(qmd, 7, screen->uniform_bo,
                               NVC0_CB_AUX_INFO(5), 1 << 11);

   nve4_compute_setup_buf_cb(nvc0, true, qmd);

   NVC3C0_QMDV02_02_VAL_SET(qmd, PROGRAM_ADDRESS_LOWER, entry & 0xffffffff);
   NVC3C0_QMDV02_02_VAL_SET(qmd, PROGRAM_ADDRESS_UPPER, entry >> 32);
}

static inline void *
nve4_compute_alloc_launch_desc(struct nouveau_context *nv,
                               struct nouveau_bo **pbo, uint64_t *pgpuaddr)
{
   uint8_t *ptr = nouveau_scratch_get(nv, 512, pgpuaddr, pbo);
   if (!ptr)
      return NULL;
   if (*pgpuaddr & 255) {
      unsigned adj = 256 - (*pgpuaddr & 255);
      ptr += adj;
      *pgpuaddr += adj;
   }
   memset(ptr, 0x00, 256);
   return ptr;
}
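
/* The 512-byte scratch allocation above leaves room to round the descriptor
 * up to a 256-byte boundary; the 256-byte QMD has to be 256-byte aligned
 * because LAUNCH_DESC_ADDRESS is programmed with its address shifted right
 * by 8.
 */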

static void
nve4_upload_indirect_desc(struct nouveau_pushbuf *push,
                          struct nv04_resource *res, uint64_t gpuaddr,
                          uint32_t length, uint32_t bo_offset)
{
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, gpuaddr);
   PUSH_DATA (push, gpuaddr);
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, length);
   PUSH_DATA (push, 1);

   nouveau_pushbuf_space(push, 32, 0, 1);
   PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);

   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (length / 4));
   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
   nouveau_pushbuf_data(push, res->bo, bo_offset,
                        NVC0_IB_ENTRY_1_NO_PREFETCH | length);
}
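
/* The helper above copies `length' bytes from a GPU buffer straight into the
 * launch descriptor through the compute UPLOAD path, without a CPU round
 * trip; nve4_launch_grid() uses it to patch the grid dimensions of an
 * indirect dispatch into the QMD.
 */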

void
nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
{
   struct nvc0_context *nvc0 = nvc0_context(pipe);
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   void *desc;
   uint64_t desc_gpuaddr;
   struct nouveau_bo *desc_bo;
   int ret;

   desc = nve4_compute_alloc_launch_desc(&nvc0->base, &desc_bo, &desc_gpuaddr);
   if (!desc) {
      ret = -1;
      goto out;
   }
   BCTX_REFN_bo(nvc0->bufctx_cp, CP_DESC, NOUVEAU_BO_GART | NOUVEAU_BO_RD,
                desc_bo);

   list_for_each_entry(struct nvc0_resident, resident, &nvc0->tex_head, list) {
      nvc0_add_resident(nvc0->bufctx_cp, NVC0_BIND_CP_BINDLESS, resident->buf,
                        resident->flags);
   }

   list_for_each_entry(struct nvc0_resident, resident, &nvc0->img_head, list) {
      nvc0_add_resident(nvc0->bufctx_cp, NVC0_BIND_CP_BINDLESS, resident->buf,
                        resident->flags);
   }

   ret = !nve4_state_validate_cp(nvc0, ~0);
   if (ret)
      goto out;

   if (nvc0->screen->compute->oclass >= GV100_COMPUTE_CLASS)
      gv100_compute_setup_launch_desc(nvc0, desc, info);
   else
   if (nvc0->screen->compute->oclass >= GP100_COMPUTE_CLASS)
      gp100_compute_setup_launch_desc(nvc0, desc, info);
   else
      nve4_compute_setup_launch_desc(nvc0, desc, info);

   nve4_compute_upload_input(nvc0, info);

#ifndef NDEBUG
   if (debug_get_num_option("NV50_PROG_DEBUG", 0)) {
      debug_printf("Queue Meta Data:\n");
      if (nvc0->screen->compute->oclass >= GV100_COMPUTE_CLASS)
         NVC3C0QmdDump_V02_02(desc);
      else
      if (nvc0->screen->compute->oclass >= GP100_COMPUTE_CLASS)
         NVC0C0QmdDump_V02_01(desc);
      else
         NVA0C0QmdDump_V00_06(desc);
   }
#endif

   if (unlikely(info->indirect)) {
      struct nv04_resource *res = nv04_resource(info->indirect);
      uint32_t offset = res->offset + info->indirect_offset;

      /* upload the descriptor */
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
      PUSH_DATAh(push, desc_gpuaddr);
      PUSH_DATA (push, desc_gpuaddr);
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
      PUSH_DATA (push, 256);
      PUSH_DATA (push, 1);
      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (256 / 4));
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
      PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);

      if (nvc0->screen->compute->oclass >= GP100_COMPUTE_CLASS) {
         nve4_upload_indirect_desc(push, res, desc_gpuaddr + 48, 12, offset);
      } else {
         /* overwrite griddim_x and griddim_y as two 32-bit integers even
          * though griddim_y must be a 16-bit integer */
         nve4_upload_indirect_desc(push, res, desc_gpuaddr + 48, 8, offset);

         /* overwrite the 16 high bits of griddim_y with griddim_z because
          * we need (z << 16) | y */
         nve4_upload_indirect_desc(push, res, desc_gpuaddr + 54, 4, offset + 8);
      }
   }

   /* upload descriptor and flush */
   nouveau_pushbuf_space(push, 32, 1, 0);
   PUSH_REFN(push, screen->text, NV_VRAM_DOMAIN(&screen->base) | NOUVEAU_BO_RD);
   BEGIN_NVC0(push, NVE4_CP(LAUNCH_DESC_ADDRESS), 1);
   PUSH_DATA (push, desc_gpuaddr >> 8);
   BEGIN_NVC0(push, NVE4_CP(LAUNCH), 1);
   PUSH_DATA (push, 0x3);
   BEGIN_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1);
   PUSH_DATA (push, 0);

   nvc0_update_compute_invocations_counter(nvc0, info);

out:
   if (ret)
      NOUVEAU_ERR("Failed to launch grid !\n");
   nouveau_scratch_done(&nvc0->base);
   nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_DESC);
   nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_BINDLESS);
}


#define NVE4_TIC_ENTRY_INVALID 0x000fffff
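/* The combined texture handles stored in nvc0->tex_handles[] keep the TIC
 * index in their low 20 bits; setting all of them marks the slot as unbound
 * (the sampler index presumably lives in the bits above).
 */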

static void
nve4_compute_validate_textures(struct nvc0_context *nvc0)
{
   struct nouveau_bo *txc = nvc0->screen->txc;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   const unsigned s = 5;
   unsigned i;
   uint32_t commands[2][32];
   unsigned n[2] = { 0, 0 };

   for (i = 0; i < nvc0->num_textures[s]; ++i) {
      struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
      struct nv04_resource *res;
      const bool dirty = !!(nvc0->textures_dirty[s] & (1 << i));

      if (!tic) {
         nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID;
         continue;
      }
      res = nv04_resource(tic->pipe.texture);
      nvc0_update_tic(nvc0, tic, res);

      if (tic->id < 0) {
         tic->id = nvc0_screen_tic_alloc(nvc0->screen, tic);

         PUSH_SPACE(push, 16);
         BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
         PUSH_DATAh(push, txc->offset + (tic->id * 32));
         PUSH_DATA (push, txc->offset + (tic->id * 32));
         BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
         PUSH_DATA (push, 32);
         PUSH_DATA (push, 1);
         BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 9);
         PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
         PUSH_DATAp(push, &tic->tic[0], 8);

         commands[0][n[0]++] = (tic->id << 4) | 1;
      } else
      if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
         commands[1][n[1]++] = (tic->id << 4) | 1;
      }
      nvc0->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32);

      res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
      res->status |= NOUVEAU_BUFFER_STATUS_GPU_READING;

      nvc0->tex_handles[s][i] &= ~NVE4_TIC_ENTRY_INVALID;
      nvc0->tex_handles[s][i] |= tic->id;
      if (dirty)
         BCTX_REFN(nvc0->bufctx_cp, CP_TEX(i), res, RD);
   }
   for (; i < nvc0->state.num_textures[s]; ++i) {
      nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID;
      nvc0->textures_dirty[s] |= 1 << i;
   }

   if (n[0]) {
      BEGIN_NIC0(push, NVE4_CP(TIC_FLUSH), n[0]);
      PUSH_DATAp(push, commands[0], n[0]);
   }
   if (n[1]) {
      BEGIN_NIC0(push, NVE4_CP(TEX_CACHE_CTL), n[1]);
      PUSH_DATAp(push, commands[1], n[1]);
   }

   nvc0->state.num_textures[s] = nvc0->num_textures[s];

   /* Invalidate all 3D textures because they are aliased. */
   for (int s = 0; s < 5; s++) {
      for (int i = 0; i < nvc0->num_textures[s]; i++)
         nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TEX(s, i));
      nvc0->textures_dirty[s] = ~0;
   }
   nvc0->dirty_3d |= NVC0_NEW_3D_TEXTURES;
}

#ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER
static void
nve4_compute_trap_info(struct nvc0_context *nvc0)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_bo *bo = screen->parm;
   int ret, i;
   volatile struct nve4_mp_trap_info *info;
   uint8_t *map;

   ret = nouveau_bo_map(bo, NOUVEAU_BO_RDWR, nvc0->base.client);
   if (ret)
      return;
   map = (uint8_t *)bo->map;
   info = (volatile struct nve4_mp_trap_info *)(map + NVE4_CP_PARAM_TRAP_INFO);

   if (info->lock) {
      debug_printf("trapstat = %08x\n", info->trapstat);
      debug_printf("warperr = %08x\n", info->warperr);
      debug_printf("PC = %x\n", info->pc);
      debug_printf("tid = %u %u %u\n",
                   info->tid[0], info->tid[1], info->tid[2]);
      debug_printf("ctaid = %u %u %u\n",
                   info->ctaid[0], info->ctaid[1], info->ctaid[2]);
      for (i = 0; i <= 63; ++i)
         debug_printf("$r%i = %08x\n", i, info->r[i]);
      for (i = 0; i <= 6; ++i)
         debug_printf("$p%i = %i\n", i, (info->flags >> i) & 1);
      debug_printf("$c = %x\n", info->flags >> 12);
   }
   info->lock = 0;
}
#endif