/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

#include <stdlib.h>
#include <string.h>
#include <inttypes.h>

#include "common/util.h"
#include "common/macros.h"
#include "common/vk/assert_vk.h"

#include "hs_vk.h"
#include "hs_vk_target.h"

//
// We want concurrent kernel execution to occur in a few places.
//
// The summary is:
//
// 1) If necessary, some max-valued keys are written to the end of
//    the vin/vout buffers.
//
// 2) Blocks of slabs of keys are sorted.
//
// 3) If necessary, the blocks of slabs are merged until complete.
//
// 4) If requested, the slabs will be converted from slab ordering
//    to linear ordering.
//
// Below is the general "happens-before" relationship between HotSort
// compute kernels.
//
// Note the diagram assumes vin and vout are different buffers. If
// they're not, then the first merge doesn't include the pad_vout
// event in the wait list.
//
//          +----------+              +---------+
//          | pad_vout |              | pad_vin |
//          +----+-----+              +----+----+
//               |                         |
//               |                 WAITFOR(pad_vin)
//               |                         |
//               |                   +-----v-----+
//               |                   |           |
//               |              +----v----+ +----v----+
//               |              | bs_full | | bs_frac |
//               |              +----+----+ +----+----+
//               |                   |           |
//               |                   +-----v-----+
//               |                         |
//               |  +------NO------JUST ONE BLOCK?
//               | /                       |
//               |/                       YES
//               +                         |
//               |                         v
//               |         END_WITH_EVENTS(bs_full,bs_frac)
//               |
//               |
//  WAITFOR(pad_vout,bs_full,bs_frac) >>> first iteration of loop <<<
//               |
//               |
//               +-----------<------------+
//               |                        |
//         +-----v-----+                  |
//         |           |                  |
//    +----v----+ +----v----+             |
//    | fm_full | | fm_frac |             |
//    +----+----+ +----+----+             |
//         |           |                  ^
//         +-----v-----+                  |
//               |                        |
//   WAITFOR(fm_full,fm_frac)             |
//               |                        |
//               v                        |
//            +--v--+                WAITFOR(bc)
//            | hm  |                     |
//            +-----+                     |
//               |                        |
//          WAITFOR(hm)                   |
//               |                        ^
//            +--v--+                     |
//            | bc  |                     |
//            +-----+                     |
//               |                        |
//               v                        |
//       MERGING COMPLETE?-------NO-------+
//               |
//              YES
//               |
//               v
//      END_WITH_EVENTS(bc)
//

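//
// A minimal host-side usage sketch, intended only to show the expected
// call order of the functions defined below: hs_vk_pad() computes the
// padded key counts, hs_vk_ds_alloc()/hs_vk_ds_bind() wire the vin/vout
// buffers into the descriptor set, and hs_vk_sort() records the sort
// into an open command buffer. The device, hs_target, cb, desc_pool,
// vin and vout objects are assumed to be created and sized by the
// caller (vin and vout must hold count_padded_in and count_padded_out
// keys, respectively), and the vin stage/access flags below assume the
// caller filled vin with a transfer:
//
//   struct hs_vk * hs = hs_vk_create(hs_target,device,NULL,VK_NULL_HANDLE);
//
//   uint32_t count_padded_in, count_padded_out;
//
//   hs_vk_pad(hs,count,&count_padded_in,&count_padded_out);
//
//   VkDescriptorSet hs_ds = hs_vk_ds_alloc(hs,desc_pool);
//
//   hs_vk_ds_bind(hs,hs_ds,cb,vin,vout);
//
//   hs_vk_sort(hs,cb,
//              vin, VK_PIPELINE_STAGE_TRANSFER_BIT,VK_ACCESS_TRANSFER_WRITE_BIT,
//              vout,0,0,
//              count,count_padded_in,count_padded_out,
//              true /*linearize*/);
//
//   // ... end and submit cb, then hs_vk_release(hs) once it has drained
//
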
struct hs_vk
{
  VkAllocationCallbacks const * allocator;
  VkDevice device;

  struct {
    struct {
      VkDescriptorSetLayout vout_vin;
    } layout;
  } desc_set;

  struct {
    struct {
      VkPipelineLayout vout_vin;
    } layout;
  } pipeline;

  struct hs_vk_target_config config;

  uint32_t key_val_size;
  uint32_t slab_keys;
  uint32_t bs_slabs_log2_ru;
  uint32_t bc_slabs_log2_max;

  struct {
    uint32_t count;
    VkPipeline * bs;
    VkPipeline * bc;
    VkPipeline * fm[3];
    VkPipeline * hm[3];
    VkPipeline * transpose;
    VkPipeline all[];
  } pipelines;
};

//
//
//

struct hs_state
{
  VkCommandBuffer cb;

  // If sorting in-place, then vout == vin
  VkBuffer vout;
  VkBuffer vin;

  // bx_ru is the rounded-up number of slabs (warps) in vin
  uint32_t bx_ru;
};

//
//
//

static
void
hs_barrier_compute_w_to_compute_r(struct hs_state * const state)
{
  static VkMemoryBarrier const shader_w_to_r = {
    .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
    .pNext = NULL,
    .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
    .dstAccessMask = VK_ACCESS_SHADER_READ_BIT
  };

  vkCmdPipelineBarrier(state->cb,
                       VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                       VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                       0,
                       1,
                       &shader_w_to_r,
                       0,
                       NULL,
                       0,
                       NULL);
}

//
//
//

static
void
hs_barrier_to_compute_r(struct hs_state * const state,
                        VkPipelineStageFlags const src_stage,
                        VkAccessFlagBits const src_access)
{
  if (src_stage == 0)
    return;

  VkMemoryBarrier const compute_r = {
    .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
    .pNext = NULL,
    .srcAccessMask = src_access,
    .dstAccessMask = VK_ACCESS_SHADER_READ_BIT
  };

  vkCmdPipelineBarrier(state->cb,
                       src_stage,
                       VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                       0,
                       1,
                       &compute_r,
                       0,
                       NULL,
                       0,
                       NULL);
}

//
//
//

static
void
hs_barrier_to_transfer_fill(struct hs_state * const state,
                            VkPipelineStageFlags const src_stage,
                            VkAccessFlagBits const src_access)
{
  if (src_stage == 0)
    return;

  VkMemoryBarrier const fill_w = {
    .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
    .pNext = NULL,
    .srcAccessMask = src_access,
    .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT
  };

  vkCmdPipelineBarrier(state->cb,
                       src_stage,
                       VK_PIPELINE_STAGE_TRANSFER_BIT,
                       0,
                       1,
                       &fill_w,
                       0,
                       NULL,
                       0,
                       NULL);
}

//
//
//

static
void
hs_transpose(struct hs_vk const * const hs,
             struct hs_state * const state)
{
  hs_barrier_compute_w_to_compute_r(state);

  vkCmdBindPipeline(state->cb,
                    VK_PIPELINE_BIND_POINT_COMPUTE,
                    hs->pipelines.transpose[0]);

  vkCmdDispatch(state->cb,state->bx_ru,1,1);
}

//
//
//

static
void
hs_bc(struct hs_vk const * const hs,
      struct hs_state * const state,
      uint32_t const down_slabs,
      uint32_t const clean_slabs_log2)
{
  hs_barrier_compute_w_to_compute_r(state);

  // block-clean the minimal number of spans of 2^clean_slabs_log2 slabs
  uint32_t const frac_ru = (1u << clean_slabs_log2) - 1;
  uint32_t const full_bc = (down_slabs + frac_ru) >> clean_slabs_log2;

  vkCmdBindPipeline(state->cb,
                    VK_PIPELINE_BIND_POINT_COMPUTE,
                    hs->pipelines.bc[clean_slabs_log2]);

  vkCmdDispatch(state->cb,full_bc,1,1);
}

//
//
//

static
uint32_t
hs_hm(struct hs_vk const * const hs,
      struct hs_state * const state,
      uint32_t const down_slabs,
      uint32_t const clean_slabs_log2)
{
  hs_barrier_compute_w_to_compute_r(state);

  // how many scaled half-merge spans are there?
  uint32_t const frac_ru = (1 << clean_slabs_log2) - 1;
  uint32_t const spans = (down_slabs + frac_ru) >> clean_slabs_log2;

  // for now, just clamp to the max
  uint32_t const log2_rem = clean_slabs_log2 - hs->bc_slabs_log2_max;
  uint32_t const scale_log2 = MIN_MACRO(hs->config.merge.hm.scale_max,log2_rem);
  uint32_t const log2_out = log2_rem - scale_log2;

  // size the grid
  uint32_t const slab_span = hs->config.slab.height << log2_out;

  vkCmdBindPipeline(state->cb,
                    VK_PIPELINE_BIND_POINT_COMPUTE,
                    hs->pipelines.hm[scale_log2][0]);

  vkCmdDispatch(state->cb,slab_span,spans,1);

  return log2_out;
}

//
// FIXME -- some of this logic can be skipped if BS is a power-of-two
//

static
uint32_t
hs_fm(struct hs_vk const * const hs,
      struct hs_state * const state,
      uint32_t * const down_slabs,
      uint32_t const up_scale_log2)
{
  //
  // FIXME OPTIMIZATION: in previous HotSort launchers it's sometimes
  // a performance win to bias toward launching the smaller flip merge
  // kernel in order to get more warps in flight (increased
  // occupancy). This is useful when merging small numbers of slabs.
  //
  // Note that HS_FM_SCALE_MIN will always be 0 or 1.
  //
  // So, for now, just clamp to the max until there is a reason to
  // restore the fancier and probably low-impact approach.
  //
  uint32_t const scale_log2 = MIN_MACRO(hs->config.merge.fm.scale_max,up_scale_log2);
  uint32_t const clean_log2 = up_scale_log2 - scale_log2;

  // number of slabs in a full-sized scaled flip-merge span
  uint32_t const full_span_slabs = hs->config.block.slabs << up_scale_log2;

  // how many full-sized scaled flip-merge spans are there?
  uint32_t full_fm = state->bx_ru / full_span_slabs;
  uint32_t frac_fm = 0;

  // initialize down_slabs
  *down_slabs = full_fm * full_span_slabs;

  // how many half-size scaled + fractional scaled spans are there?
  uint32_t const span_rem = state->bx_ru - *down_slabs;
  uint32_t const half_span_slabs = full_span_slabs >> 1;

  // if we have over a half-span then fractionally merge it
  if (span_rem > half_span_slabs)
  {
    // the remaining slabs will be cleaned
    *down_slabs += span_rem;

    uint32_t const frac_rem = span_rem - half_span_slabs;
    uint32_t const frac_rem_pow2 = pow2_ru_u32(frac_rem);

    if (frac_rem_pow2 >= half_span_slabs)
    {
      // bump it up to a full span
      full_fm += 1;
    }
    else
    {
      // otherwise, add fractional
      frac_fm = MAX_MACRO(1,frac_rem_pow2 >> clean_log2);
    }
  }

  //
  // Size the grid
  //
  // The simplifying choices below limit the maximum number of keys
  // that can be sorted with this grid scheme to ~2B.
  //
  // .x : slab height << clean_log2 -- this is the slab span
  // .y : [1...65535] -- this is the slab index
  // .z : ( this could also be used to further expand .y )
  //
  // Note that OpenCL declares a grid in terms of global threads and
  // not grids and blocks
  //

  //
  // size the grid
  //
  uint32_t const slab_span = hs->config.slab.height << clean_log2;

  if (full_fm > 0)
  {
    uint32_t const full_idx = hs->bs_slabs_log2_ru - 1 + scale_log2;

    vkCmdBindPipeline(state->cb,
                      VK_PIPELINE_BIND_POINT_COMPUTE,
                      hs->pipelines.fm[scale_log2][full_idx]);

    vkCmdDispatch(state->cb,slab_span,full_fm,1);
  }

  if (frac_fm > 0)
  {
    vkCmdBindPipeline(state->cb,
                      VK_PIPELINE_BIND_POINT_COMPUTE,
                      hs->pipelines.fm[scale_log2][msb_idx_u32(frac_fm)]);

    vkCmdDispatchBase(state->cb,
                      0,full_fm,0,
                      slab_span,1,1);
  }

  return clean_log2;
}
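
//
// Worked example for hs_fm() under a hypothetical config (block.slabs=16,
// slab.height=16, merge.fm.scale_max=1, so bs_slabs_log2_ru=4), with
// bx_ru=50 slabs and up_scale_log2=1:
//
//   scale_log2      = min(1,1) = 1,  clean_log2 = 0
//   full_span_slabs = 16 << 1  = 32
//   full_fm         = 50 / 32  = 1   (down_slabs = 32)
//   span_rem        = 18 > half_span_slabs = 16, so the remainder is
//   fractionally merged: down_slabs = 50, frac_rem_pow2 = 2, and
//   frac_fm         = max(1, 2 >> 0) = 2
//
// yielding a (16,1,1) dispatch of fm[1][4] for the full spans, a
// (16,1,1) dispatch of fm[1][1] based at y=1 for the fractional span,
// and a returned clean_log2 of 0.
//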

//
//
//

static
void
hs_bs(struct hs_vk const * const hs,
      struct hs_state * const state,
      uint32_t const count_padded_in)
{
  uint32_t const slabs_in = count_padded_in / hs->slab_keys;
  uint32_t const full_bs = slabs_in / hs->config.block.slabs;
  uint32_t const frac_bs = slabs_in - full_bs * hs->config.block.slabs;

  if (full_bs > 0)
  {
    vkCmdBindPipeline(state->cb,
                      VK_PIPELINE_BIND_POINT_COMPUTE,
                      hs->pipelines.bs[hs->bs_slabs_log2_ru]);

    vkCmdDispatch(state->cb,full_bs,1,1);
  }

  if (frac_bs > 0)
  {
    uint32_t const frac_idx = msb_idx_u32(frac_bs);
    uint32_t const full_to_frac_log2 = hs->bs_slabs_log2_ru - frac_idx;

    vkCmdBindPipeline(state->cb,
                      VK_PIPELINE_BIND_POINT_COMPUTE,
                      hs->pipelines.bs[msb_idx_u32(frac_bs)]);

    vkCmdDispatchBase(state->cb,
                      full_bs<<full_to_frac_log2,0,0,
                      1,1,1);
  }
}
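
//
// Worked example for hs_bs() under a hypothetical config (block.slabs=16,
// slab_keys=256, so bs_slabs_log2_ru=4), with count_padded_in=10240,
// i.e. 40 slabs:
//
//   full_bs = 40 / 16 = 2  -> bs[4] dispatched as (2,1,1)
//   frac_bs = 40 - 32 = 8  -> bs[3] dispatched as a single workgroup with
//                             base x = 2 << (4-3) = 4, i.e. offset past
//                             the 32 slabs (4 groups of 8) already covered
//                             by the full blocks
//
// hs_vk_pad() pads the fractional block up to a power-of-two number of
// slabs (or up to a full block), which is why msb_idx_u32(frac_bs) can
// directly index the bs[] pipeline array.
//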

//
//
//

static
void
hs_keyset_pre_fm(struct hs_vk const * const hs,
                 struct hs_state * const state,
                 uint32_t const count_lo,
                 uint32_t const count_hi)
{
  uint32_t const vout_span = count_hi - count_lo;

  vkCmdFillBuffer(state->cb,
                  state->vout,
                  count_lo * hs->key_val_size,
                  vout_span * hs->key_val_size,
                  UINT32_MAX);
}

//
//
//

static
void
hs_keyset_pre_bs(struct hs_vk const * const hs,
                 struct hs_state * const state,
                 uint32_t const count,
                 uint32_t const count_hi)
{
  uint32_t const vin_span = count_hi - count;

  vkCmdFillBuffer(state->cb,
                  state->vin,
                  count * hs->key_val_size,
                  vin_span * hs->key_val_size,
                  UINT32_MAX);
}

//
//
//

void
hs_vk_ds_bind(struct hs_vk const * const hs,
              VkDescriptorSet hs_ds,
              VkCommandBuffer cb,
              VkBuffer vin,
              VkBuffer vout)
{
  //
  // initialize the HotSort descriptor set
  //
  VkDescriptorBufferInfo const dbi[] = {
    {
      .buffer = vout == VK_NULL_HANDLE ? vin : vout,
      .offset = 0,
      .range = VK_WHOLE_SIZE
    },
    {
      .buffer = vin,
      .offset = 0,
      .range = VK_WHOLE_SIZE
    }
  };

  VkWriteDescriptorSet const wds[] = {
    {
      .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
      .pNext = NULL,
      .dstSet = hs_ds,
      .dstBinding = 0,
      .dstArrayElement = 0,
      .descriptorCount = 2,
      .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
      .pImageInfo = NULL,
      .pBufferInfo = dbi,
      .pTexelBufferView = NULL
    }
  };

  vkUpdateDescriptorSets(hs->device,
                         ARRAY_LENGTH_MACRO(wds),
                         wds,
                         0,
                         NULL);

  //
  // All HotSort kernels can use the same descriptor set:
  //
  //   {
  //     HS_KEY_TYPE vout[];
  //     HS_KEY_TYPE vin[];
  //   }
  //
  // Note that only the bs() kernels read from vin[].
  //
  vkCmdBindDescriptorSets(cb,
                          VK_PIPELINE_BIND_POINT_COMPUTE,
                          hs->pipeline.layout.vout_vin,
                          0,
                          1,
                          &hs_ds,
                          0,
                          NULL);
}

//
//
//

void
hs_vk_sort(struct hs_vk const * const hs,
           VkCommandBuffer cb,
           VkBuffer vin,
           VkPipelineStageFlags const vin_src_stage,
           VkAccessFlagBits const vin_src_access,
           VkBuffer vout,
           VkPipelineStageFlags const vout_src_stage,
           VkAccessFlagBits const vout_src_access,
           uint32_t const count,
           uint32_t const count_padded_in,
           uint32_t const count_padded_out,
           bool const linearize)
{
  // is this sort in place?
  bool const is_in_place = (vout == VK_NULL_HANDLE);

  //
  // create some common state
  //
  struct hs_state state = {
    .cb = cb,
    .vin = vin,
    .vout = is_in_place ? vin : vout,
    .bx_ru = (count + hs->slab_keys - 1) / hs->slab_keys
  };

  // initialize vin
  uint32_t const count_hi = is_in_place ? count_padded_out : count_padded_in;
  bool const is_pre_sort_reqd = count_hi > count;
  bool const is_pre_merge_reqd = !is_in_place && (count_padded_out > count_padded_in);

  //
  // pre-sort keyset needs to happen before bs()
  // pre-merge keyset needs to happen before fm()
  //

  VkPipelineStageFlags bs_src_stage = 0;
  VkAccessFlagBits bs_src_access = 0;

  // initialize any trailing keys in vin before sorting
  if (is_pre_sort_reqd)
  {
    hs_barrier_to_transfer_fill(&state,vin_src_stage,vin_src_access);

    hs_keyset_pre_bs(hs,&state,count,count_hi);

    bs_src_stage |= VK_PIPELINE_STAGE_TRANSFER_BIT;
    bs_src_access |= VK_ACCESS_TRANSFER_WRITE_BIT;
  }
  else
  {
    bs_src_stage = vin_src_stage;
    bs_src_access = vin_src_access;
  }

  hs_barrier_to_compute_r(&state,bs_src_stage,bs_src_access);

  // sort blocks of slabs... after hs_keyset_pre_bs()
  hs_bs(hs,&state,count_padded_in);

  VkPipelineStageFlags fm_src_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
  VkAccessFlagBits fm_src_access = VK_ACCESS_SHADER_READ_BIT;

  // initialize any trailing keys in vout before merging
  if (is_pre_merge_reqd)
  {
    hs_barrier_to_transfer_fill(&state,vout_src_stage,vout_src_access);

    hs_keyset_pre_fm(hs,&state,count_padded_in,count_padded_out);

    fm_src_stage |= VK_PIPELINE_STAGE_TRANSFER_BIT;
    fm_src_access |= VK_ACCESS_TRANSFER_WRITE_BIT;
  }
  else
  {
    fm_src_stage |= vout_src_stage;
    fm_src_access |= vout_src_access;
  }

  //
  // if this was a single bs block then there is no merging
  //
  if (state.bx_ru > hs->config.block.slabs)
  {
    hs_barrier_to_compute_r(&state,fm_src_stage,fm_src_access);

    //
    // otherwise, merge sorted spans of slabs until done
    //
    int32_t up_scale_log2 = 1;

    while (true)
    {
      uint32_t down_slabs;

      // flip merge slabs -- return span of slabs that must be cleaned
      uint32_t clean_slabs_log2 = hs_fm(hs,&state,
                                        &down_slabs,
                                        up_scale_log2);

      // if the span is larger than the largest slab block cleaner, then half-merge
      while (clean_slabs_log2 > hs->bc_slabs_log2_max)
      {
        clean_slabs_log2 = hs_hm(hs,&state,
                                 down_slabs,
                                 clean_slabs_log2);
      }

      // launch clean slab grid -- is it the final launch?
      hs_bc(hs,&state,down_slabs,clean_slabs_log2);

      // was this the final block clean?
      if (((uint32_t)hs->config.block.slabs << up_scale_log2) >= state.bx_ru)
        break;

      // otherwise, merge twice as many slabs
      up_scale_log2 += 1;

      // drop a barrier
      hs_barrier_compute_w_to_compute_r(&state);
    }
  }

  // slabs or linear?
  if (linearize)
    hs_transpose(hs,&state);
}

//
//
//

#ifdef HS_VK_VERBOSE_STATISTICS_AMD

#include <stdio.h>

static
void
hs_vk_verbose_statistics_amd(VkDevice device, struct hs_vk const * const hs)
{
  PFN_vkGetShaderInfoAMD vkGetShaderInfoAMD =
    (PFN_vkGetShaderInfoAMD)
    vkGetDeviceProcAddr(device,"vkGetShaderInfoAMD");

  if (vkGetShaderInfoAMD == NULL)
    return;

  fprintf(stdout,
          "                                  PHY   PHY AVAIL AVAIL\n"
          "VGPRs SGPRs LDS_MAX LDS/WG  SPILL VGPRs SGPRs VGPRs SGPRs WORKGROUP_SIZE\n");

  for (uint32_t ii=0; ii<hs->pipelines.count; ii++)
  {
    VkShaderStatisticsInfoAMD ssi_amd;
    size_t ssi_amd_size = sizeof(ssi_amd);

    if (vkGetShaderInfoAMD(hs->device,
                           hs->pipelines.all[ii],
                           VK_SHADER_STAGE_COMPUTE_BIT,
                           VK_SHADER_INFO_TYPE_STATISTICS_AMD,
                           &ssi_amd_size,
                           &ssi_amd) == VK_SUCCESS)
    {
      fprintf(stdout,
              "%5" PRIu32 " "
              "%5" PRIu32 " "
              "%5" PRIu32 " "

              "%6zu "
              "%6zu "

              "%5" PRIu32 " "
              "%5" PRIu32 " "
              "%5" PRIu32 " "
              "%5" PRIu32 " "

              "( %6" PRIu32 ", " "%6" PRIu32 ", " "%6" PRIu32 " )\n",
              ssi_amd.resourceUsage.numUsedVgprs,
              ssi_amd.resourceUsage.numUsedSgprs,
              ssi_amd.resourceUsage.ldsSizePerLocalWorkGroup,
              ssi_amd.resourceUsage.ldsUsageSizeInBytes, // size_t
              ssi_amd.resourceUsage.scratchMemUsageInBytes, // size_t
              ssi_amd.numPhysicalVgprs,
              ssi_amd.numPhysicalSgprs,
              ssi_amd.numAvailableVgprs,
              ssi_amd.numAvailableSgprs,
              ssi_amd.computeWorkGroupSize[0],
              ssi_amd.computeWorkGroupSize[1],
              ssi_amd.computeWorkGroupSize[2]);
    }
  }
}

#endif

//
//
//

#ifdef HS_VK_VERBOSE_DISASSEMBLY_AMD

#include <stdio.h>

static
void
hs_vk_verbose_disassembly_amd(VkDevice device, struct hs_vk const * const hs)
{
  PFN_vkGetShaderInfoAMD vkGetShaderInfoAMD =
    (PFN_vkGetShaderInfoAMD)
    vkGetDeviceProcAddr(device,"vkGetShaderInfoAMD");

  if (vkGetShaderInfoAMD == NULL)
    return;

  for (uint32_t ii=0; ii<hs->pipelines.count; ii++)
  {
    size_t disassembly_amd_size;

    if (vkGetShaderInfoAMD(hs->device,
                           hs->pipelines.all[ii],
                           VK_SHADER_STAGE_COMPUTE_BIT,
                           VK_SHADER_INFO_TYPE_DISASSEMBLY_AMD,
                           &disassembly_amd_size,
                           NULL) == VK_SUCCESS)
    {
      void * disassembly_amd = malloc(disassembly_amd_size);

      if (vkGetShaderInfoAMD(hs->device,
                             hs->pipelines.all[ii],
                             VK_SHADER_STAGE_COMPUTE_BIT,
                             VK_SHADER_INFO_TYPE_DISASSEMBLY_AMD,
                             &disassembly_amd_size,
                             disassembly_amd) == VK_SUCCESS)
      {
        fprintf(stdout,"%s",(char*)disassembly_amd);
      }

      free(disassembly_amd);
    }
  }
}

#endif

//
//
//

struct hs_vk *
hs_vk_create(struct hs_vk_target const * const target,
             VkDevice device,
             VkAllocationCallbacks const * allocator,
             VkPipelineCache pipeline_cache)
{
  //
  // we reference these values a lot
  //
  uint32_t const bs_slabs_log2_ru = msb_idx_u32(pow2_ru_u32(target->config.block.slabs));
  uint32_t const bc_slabs_log2_max = msb_idx_u32(pow2_rd_u32(target->config.block.slabs));

  //
  // how many kernels will be created?
  //
  uint32_t const count_bs = bs_slabs_log2_ru + 1;
  uint32_t const count_bc = bc_slabs_log2_max + 1;
  uint32_t count_fm[3] = { 0 };
  uint32_t count_hm[3] = { 0 };

  // guaranteed to be in range [0,2]
  for (uint32_t scale = target->config.merge.fm.scale_min;
       scale <= target->config.merge.fm.scale_max;
       scale++)
  {
    uint32_t fm_left = (target->config.block.slabs / 2) << scale;

    count_fm[scale] = msb_idx_u32(pow2_ru_u32(fm_left)) + 1;
  }

  // guaranteed to be in range [0,2]
  for (uint32_t scale = target->config.merge.hm.scale_min;
       scale <= target->config.merge.hm.scale_max;
       scale++)
  {
    count_hm[scale] = 1;
  }

  uint32_t const count_bc_fm_hm_transpose =
    + count_bc
    + count_fm[0] + count_fm[1] + count_fm[2]
    + count_hm[0] + count_hm[1] + count_hm[2]
    + 1; // transpose

  uint32_t const count_all = count_bs + count_bc_fm_hm_transpose;

  //
  // allocate hs_vk
  //
  struct hs_vk * hs;

  if (allocator == NULL)
  {
    hs = malloc(sizeof(*hs) + sizeof(VkPipeline) * count_all);
  }
  else
  {
    hs = allocator->pfnAllocation(NULL,
                                  sizeof(*hs) + sizeof(VkPipeline) * count_all,
                                  0,
                                  VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
  }

  // save device & allocator
  hs->device = device;
  hs->allocator = allocator;

  //
  // create one descriptor set layout
  //
  static VkDescriptorSetLayoutBinding const dslb_vout_vin[] = {
    {
      .binding = 0, // vout
      .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
      .descriptorCount = 1,
      .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
      .pImmutableSamplers = NULL
    },
    {
      .binding = 1, // vin
      .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
      .descriptorCount = 1,
      .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
      .pImmutableSamplers = NULL
    }
  };

  static VkDescriptorSetLayoutCreateInfo const dscli = {
    .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
    .pNext = NULL,
    .flags = 0,
    .bindingCount = 2, // 0:vout[], 1:vin[]
    .pBindings = dslb_vout_vin
  };

  vk(CreateDescriptorSetLayout(device,
                               &dscli,
                               allocator,
                               &hs->desc_set.layout.vout_vin));

  //
  // create one pipeline layout
  //
  VkPipelineLayoutCreateInfo plci = {
    .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
    .pNext = NULL,
    .flags = 0,
    .setLayoutCount = 1,
    .pSetLayouts = &hs->desc_set.layout.vout_vin,
    .pushConstantRangeCount = 0,
    .pPushConstantRanges = NULL
  };

  vk(CreatePipelineLayout(device,
                          &plci,
                          allocator,
                          &hs->pipeline.layout.vout_vin));

  //
  // copy the config from the target -- we need these values later
  //
  memcpy(&hs->config,&target->config,sizeof(hs->config));

  // save some frequently used calculated values
  hs->key_val_size = (target->config.words.key + target->config.words.val) * 4;
  hs->slab_keys = target->config.slab.height << target->config.slab.width_log2;
  hs->bs_slabs_log2_ru = bs_slabs_log2_ru;
  hs->bc_slabs_log2_max = bc_slabs_log2_max;

  // save kernel count
  hs->pipelines.count = count_all;

  //
  // create all the compute pipelines by reusing this info
  //
  VkComputePipelineCreateInfo cpci = {
    .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
    .pNext = NULL,
    .flags = VK_PIPELINE_CREATE_DISPATCH_BASE, // | VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT,
    .stage = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
      .pNext = NULL,
      .flags = 0,
      .stage = VK_SHADER_STAGE_COMPUTE_BIT,
      .module = VK_NULL_HANDLE,
      .pName = "main",
      .pSpecializationInfo = NULL
    },
    .layout = hs->pipeline.layout.vout_vin,
    .basePipelineHandle = VK_NULL_HANDLE,
    .basePipelineIndex = 0
  };

  //
  // Create a shader module, use it to create a pipeline... and
  // dispose of the shader module.
  //
  // The BS compute shaders have the same layout
  // The non-BS compute shaders have the same layout
  //
  VkShaderModuleCreateInfo smci = {
    .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
    .pNext = NULL,
    .flags = 0,
    .codeSize = 0,
    .pCode = (uint32_t const *)target->modules // FIXME -- unfortunate typecast
  };

  //
  // bs kernels have layout: (vout,vin)
  // remaining have layout: (vout)
  //
  for (uint32_t ii=0; ii<count_all; ii++)
  {
    // convert bytes to words
    uint32_t const * const module = smci.pCode + smci.codeSize / sizeof(*module);

    smci.codeSize = NTOHL_MACRO(module[0]);
    smci.pCode = module + 1;

    vk(CreateShaderModule(device,
                          &smci,
                          allocator,
                          &cpci.stage.module));

    vk(CreateComputePipelines(device,
                              pipeline_cache,
                              1,
                              &cpci,
                              allocator,
                              hs->pipelines.all+ii));

    vkDestroyShaderModule(device,
                          cpci.stage.module,
                          allocator);
  }

  //
  // initialize pointers to pipeline handles
  //
  VkPipeline * pipeline_next = hs->pipelines.all;

  // BS
  hs->pipelines.bs = pipeline_next;
  pipeline_next += count_bs;

  // BC
  hs->pipelines.bc = pipeline_next;
  pipeline_next += count_bc;

  // FM[0]
  hs->pipelines.fm[0] = count_fm[0] ? pipeline_next : NULL;
  pipeline_next += count_fm[0];

  // FM[1]
  hs->pipelines.fm[1] = count_fm[1] ? pipeline_next : NULL;
  pipeline_next += count_fm[1];

  // FM[2]
  hs->pipelines.fm[2] = count_fm[2] ? pipeline_next : NULL;
  pipeline_next += count_fm[2];

  // HM[0]
  hs->pipelines.hm[0] = count_hm[0] ? pipeline_next : NULL;
  pipeline_next += count_hm[0];

  // HM[1]
  hs->pipelines.hm[1] = count_hm[1] ? pipeline_next : NULL;
  pipeline_next += count_hm[1];

  // HM[2]
  hs->pipelines.hm[2] = count_hm[2] ? pipeline_next : NULL;
  pipeline_next += count_hm[2];

  // TRANSPOSE
  hs->pipelines.transpose = pipeline_next;
  pipeline_next += 1;

  //
  // optionally dump pipeline stats
  //
#ifdef HS_VK_VERBOSE_STATISTICS_AMD
  hs_vk_verbose_statistics_amd(device,hs);
#endif
#ifdef HS_VK_VERBOSE_DISASSEMBLY_AMD
  hs_vk_verbose_disassembly_amd(device,hs);
#endif

  //
  //
  //

  return hs;
}

//
//
//

void
hs_vk_release(struct hs_vk * const hs)
{
  vkDestroyDescriptorSetLayout(hs->device,
                               hs->desc_set.layout.vout_vin,
                               hs->allocator);

  vkDestroyPipelineLayout(hs->device,
                          hs->pipeline.layout.vout_vin,
                          hs->allocator);

  for (uint32_t ii=0; ii<hs->pipelines.count; ii++)
  {
    vkDestroyPipeline(hs->device,
                      hs->pipelines.all[ii],
                      hs->allocator);
  }

  if (hs->allocator == NULL)
  {
    free(hs);
  }
  else
  {
    hs->allocator->pfnFree(NULL,hs);
  }
}

//
// Allocate a per-thread descriptor set for the vin and vout
// VkBuffers. Note that HotSort uses only one descriptor set.
//

VkDescriptorSet
hs_vk_ds_alloc(struct hs_vk const * const hs, VkDescriptorPool desc_pool)
{
  VkDescriptorSetAllocateInfo const ds_alloc_info = {
    .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
    .pNext = NULL,
    .descriptorPool = desc_pool,
    .descriptorSetCount = 1,
    .pSetLayouts = &hs->desc_set.layout.vout_vin
  };

  VkDescriptorSet hs_ds;

  vk(AllocateDescriptorSets(hs->device,
                            &ds_alloc_info,
                            &hs_ds));

  return hs_ds;
}
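
//
// A minimal sketch of a VkDescriptorPool compatible with hs_vk_ds_alloc().
// The layout above needs one set containing two storage-buffer
// descriptors; pool creation is otherwise left to the caller, so treat
// this as an illustration rather than a requirement:
//
//   VkDescriptorPoolSize const dps = {
//     .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
//     .descriptorCount = 2
//   };
//
//   VkDescriptorPoolCreateInfo const dpci = {
//     .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
//     .pNext = NULL,
//     .flags = 0,
//     .maxSets = 1,
//     .poolSizeCount = 1,
//     .pPoolSizes = &dps
//   };
//
//   VkDescriptorPool desc_pool;
//
//   vk(CreateDescriptorPool(device,&dpci,NULL,&desc_pool));
//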

//
//
//

void
hs_vk_pad(struct hs_vk const * const hs,
          uint32_t const count,
          uint32_t * const count_padded_in,
          uint32_t * const count_padded_out)
{
  //
  // round up the count to slabs
  //
  uint32_t const slabs_ru = (count + hs->slab_keys - 1) / hs->slab_keys;
  uint32_t const blocks = slabs_ru / hs->config.block.slabs;
  uint32_t const block_slabs = blocks * hs->config.block.slabs;
  uint32_t const slabs_ru_rem = slabs_ru - block_slabs;
  uint32_t const slabs_ru_rem_ru = MIN_MACRO(pow2_ru_u32(slabs_ru_rem),hs->config.block.slabs);

  *count_padded_in = (block_slabs + slabs_ru_rem_ru) * hs->slab_keys;
  *count_padded_out = *count_padded_in;

  //
  // will merging be required?
  //
  if (slabs_ru > hs->config.block.slabs)
  {
    // more than one block
    uint32_t const blocks_lo = pow2_rd_u32(blocks);
    uint32_t const block_slabs_lo = blocks_lo * hs->config.block.slabs;
    uint32_t const block_slabs_rem = slabs_ru - block_slabs_lo;

    if (block_slabs_rem > 0)
    {
      uint32_t const block_slabs_rem_ru = pow2_ru_u32(block_slabs_rem);

      uint32_t const block_slabs_hi = MAX_MACRO(block_slabs_rem_ru,
                                                blocks_lo << (1 - hs->config.merge.fm.scale_min));

      uint32_t const block_slabs_padded_out = MIN_MACRO(block_slabs_lo+block_slabs_hi,
                                                        block_slabs_lo*2); // clamp non-pow2 blocks

      *count_padded_out = block_slabs_padded_out * hs->slab_keys;
    }
  }
}
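
//
// Worked example for hs_vk_pad() under a hypothetical config
// (slab_keys=256, block.slabs=16) and count=12700:
//
//   slabs_ru         = ceil(12700/256)            = 50
//   blocks           = 50 / 16                    = 3   (block_slabs = 48)
//   slabs_ru_rem_ru  = min(pow2_ru(2),16)         = 2
//   count_padded_in  = (48 + 2) * 256             = 12800
//
// and, since 50 slabs span more than one block, merging is required:
//
//   blocks_lo        = pow2_rd(3)                 = 2   (block_slabs_lo = 32)
//   block_slabs_rem  = 50 - 32                    = 18
//   block_slabs_hi   = max(pow2_ru(18), ...)      = 32
//   count_padded_out = min(32 + 32, 32 * 2) * 256 = 16384
//
// so the caller must size vin for 12800 keys and vout for 16384 keys
// (or vin for 16384 keys when sorting in place).
//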

//
//
//