/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
//
//

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <inttypes.h>

//
//

#include "common/macros.h"
#include "common/vk/assert_vk.h"
#include "common/vk/host_alloc.h"
#include "common/vk/cache_vk.h"

//
//
//

#include "hs_vk.h"

//
// Compile-time images of HotSort targets
//

#include "hs/vk/intel/gen8/u32/hs_target.h"
#include "hs/vk/intel/gen8/u64/hs_target.h"

#include "hs/vk/nvidia/sm_35/u32/hs_target.h"
#include "hs/vk/nvidia/sm_35/u64/hs_target.h"

#include "hs/vk/amd/gcn/u32/hs_target.h"
#include "hs/vk/amd/gcn/u64/hs_target.h"

//
//
//

char const * hs_cpu_sort_u32(uint32_t * a, uint32_t const count, double * const cpu_ns);
char const * hs_cpu_sort_u64(uint64_t * a, uint32_t const count, double * const cpu_ns);
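// (implemented in a separate translation unit -- each returns the
// name of the CPU sorting algorithm used and writes the elapsed time
// in nanoseconds to *cpu_ns)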

//
//
//

static
char const *
hs_cpu_sort(void *         sorted_h,
            uint32_t const hs_words,
            uint32_t const count,
            double * const cpu_ns)
{
  if (hs_words == 1)
    return hs_cpu_sort_u32(sorted_h,count,cpu_ns);
  else
    return hs_cpu_sort_u64(sorted_h,count,cpu_ns);
}

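//
// Transpose helpers: when the sorted output is left in slab form
// (linearize == false), each slab is transposed on the host so the
// result can be compared directly against the linearly sorted CPU
// reference.
//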
static
void
hs_transpose_slabs_u32(uint32_t const hs_words,
                       uint32_t const hs_width,
                       uint32_t const hs_height,
                       uint32_t *     vout_h,
                       uint32_t const count)
{
  uint32_t const   slab_keys  = hs_width * hs_height;
  size_t const     slab_size  = sizeof(uint32_t) * hs_words * slab_keys;
  uint32_t * const slab       = ALLOCA_MACRO(slab_size);
  uint32_t         slab_count = count / slab_keys;

  while (slab_count-- > 0)
    {
      memcpy(slab,vout_h,slab_size);

      for (uint32_t row=0; row<hs_height; row++)
        for (uint32_t col=0; col<hs_width; col++)
          vout_h[col * hs_height + row] = slab[row * hs_width + col];

      vout_h += slab_keys;
    }
}

static
void
hs_transpose_slabs_u64(uint32_t const hs_words,
                       uint32_t const hs_width,
                       uint32_t const hs_height,
                       uint64_t *     vout_h,
                       uint32_t const count)
{
  uint32_t const   slab_keys  = hs_width * hs_height;
  size_t const     slab_size  = sizeof(uint32_t) * hs_words * slab_keys;
  uint64_t * const slab       = ALLOCA_MACRO(slab_size);
  uint32_t         slab_count = count / slab_keys;

  while (slab_count-- > 0)
    {
      memcpy(slab,vout_h,slab_size);

      for (uint32_t row=0; row<hs_height; row++)
        for (uint32_t col=0; col<hs_width; col++)
          vout_h[col * hs_height + row] = slab[row * hs_width + col];

      vout_h += slab_keys;
    }
}

static
void
hs_transpose_slabs(uint32_t const hs_words,
                   uint32_t const hs_width,
                   uint32_t const hs_height,
                   void *         vout_h,
                   uint32_t const count)
{
  if (hs_words == 1)
    hs_transpose_slabs_u32(hs_words,hs_width,hs_height,vout_h,count);
  else
    hs_transpose_slabs_u64(hs_words,hs_width,hs_height,vout_h,count);
}

//
//
//

#ifndef NDEBUG

static
VkBool32
VKAPI_PTR
vk_debug_report_cb(VkDebugReportFlagsEXT      flags,
                   VkDebugReportObjectTypeEXT objectType,
                   uint64_t                   object,
                   size_t                     location,
                   int32_t                    messageCode,
                   const char *               pLayerPrefix,
                   const char *               pMessage,
                   void *                     pUserData)
{
  char const * flag_str = "";
  bool         is_error = false;

#define VK_FLAG_CASE_TO_STRING(c) \
  case c:                         \
    flag_str = #c;                \
    is_error = true;              \
    break

  switch (flags)
    {
      // VK_FLAG_CASE_TO_STRING(VK_DEBUG_REPORT_INFORMATION_BIT_EXT);
      VK_FLAG_CASE_TO_STRING(VK_DEBUG_REPORT_WARNING_BIT_EXT);
      VK_FLAG_CASE_TO_STRING(VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT);
      VK_FLAG_CASE_TO_STRING(VK_DEBUG_REPORT_ERROR_BIT_EXT);
      VK_FLAG_CASE_TO_STRING(VK_DEBUG_REPORT_DEBUG_BIT_EXT);
    }

  if (is_error)
    {
      fprintf(stderr,"%s %s %s\n",
              flag_str,
              pLayerPrefix,
              pMessage);
    }

  return VK_FALSE;
}

#endif

//
//
//

static
uint32_t
hs_rand_u32()
{
  static uint32_t seed = 0xDEADBEEF;

  // Numerical Recipes
  seed = seed * 1664525 + 1013904223;

  return seed;
}

//
//
//

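//
// Fill the input with keys: the live branch generates pseudo-random
// words, while the two disabled branches fill in-order and
// reverse-order keys for debugging.
//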
static
void
hs_fill_rand(uint32_t * vin_h, uint32_t const count, uint32_t const words)
{
#if 1
  for (uint32_t ii=0; ii<count*words; ii++)
    vin_h[ii] = hs_rand_u32();
#elif 0 // in-order
  memset(vin_h,0,count*words*sizeof(uint32_t));
  for (uint32_t ii=0; ii<count; ii++)
    vin_h[ii*words] = ii;
#else   // reverse order
  memset(vin_h,0,count*words*sizeof(uint32_t));
  for (uint32_t ii=0; ii<count; ii++)
    vin_h[ii*words] = count - 1 - ii;
#endif
}

//
//
//

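//
// On verification failure (debug builds only), dump the expected and
// actual keys slab-by-slab.
//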
static
void
hs_debug_u32(uint32_t const   hs_width,
             uint32_t const   hs_height,
             uint32_t const * vout_h,
             uint32_t const   count)
{
  uint32_t const slab_keys = hs_width * hs_height;
  uint32_t const slabs     = (count + slab_keys - 1) / slab_keys;

  for (uint32_t ss=0; ss<slabs; ss++) {
    fprintf(stderr,"%u\n",ss);
    for (uint32_t cc=0; cc<hs_height; cc++) {
      for (uint32_t rr=0; rr<hs_width; rr++)
        fprintf(stderr,"%8" PRIX32 " ",*vout_h++);
      fprintf(stderr,"\n");
    }
  }
}

static
void
hs_debug_u64(uint32_t const   hs_width,
             uint32_t const   hs_height,
             uint64_t const * vout_h,
             uint32_t const   count)
{
  uint32_t const slab_keys = hs_width * hs_height;
  uint32_t const slabs     = (count + slab_keys - 1) / slab_keys;

  for (uint32_t ss=0; ss<slabs; ss++) {
    fprintf(stderr,"%u\n",ss);
    for (uint32_t cc=0; cc<hs_height; cc++) {
      for (uint32_t rr=0; rr<hs_width; rr++)
        fprintf(stderr,"%16" PRIX64 " ",*vout_h++);
      fprintf(stderr,"\n");
    }
  }
}

//
//
//

bool
is_matching_device(VkPhysicalDeviceProperties const * const phy_device_props,
                   struct hs_vk_target const * *      const hs_target,
                   uint32_t const                           vendor_id,
                   uint32_t const                           device_id,
                   uint32_t const                           key_val_words)
{
  if ((phy_device_props->vendorID != vendor_id) || (phy_device_props->deviceID != device_id))
    return false;

  if (phy_device_props->vendorID == 0x10DE)
    {
      //
      // FIXME -- for now, the kernels in this app are targeting
      // sm_35+ devices.  You could add some rigorous rejection by
      // device id here...
      //
      if (key_val_words == 1)
        *hs_target = &hs_nvidia_sm35_u32;
      else
        *hs_target = &hs_nvidia_sm35_u64;
    }
  else if (phy_device_props->vendorID == 0x8086)
    {
      //
      // FIXME -- for now, the kernels in this app are targeting GEN8+
      // devices -- this does *not* include variants of GEN9LP+
      // "Apollo Lake" because that device has a different
      // architectural "shape" than GEN8 GTx.  You could add some
      // rigorous rejection by device id here...
      //
      if (key_val_words == 1)
        *hs_target = &hs_intel_gen8_u32;
      else
        *hs_target = &hs_intel_gen8_u64;
    }
  else if (phy_device_props->vendorID == 0x1002)
    {
      //
      // AMD GCN
      //
      if (key_val_words == 1)
        *hs_target = &hs_amd_gcn_u32;
      else
        *hs_target = &hs_amd_gcn_u64;
    }
  else
    {
      return false;
    }

  return true;
}

//
//
//

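//
// Return the first memory type index that is both permitted by the
// resource's memoryTypeBits mask and has all of the required property
// flags.
//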
uint32_t
vk_find_mem_type_idx(VkPhysicalDeviceMemoryProperties const * phy_device_mem_props,
                     uint32_t const                           compatible_mem_types,
                     VkMemoryPropertyFlags const              required_mem_props,
                     bool const                               abort)
{
  //
  // FIXME -- jump between indices in the memoryTypeBits mask
  //
  uint32_t const count = phy_device_mem_props->memoryTypeCount;

  for (uint32_t index=0; index<count; index++)
    {
      // acceptable memory type for this resource?
      if ((compatible_mem_types & (1<<index)) == 0)
        continue;

      // otherwise, find first match...
      VkMemoryPropertyFlags const common_props =
        phy_device_mem_props->memoryTypes[index].propertyFlags & required_mem_props;

      if (common_props == required_mem_props)
        return index;
    }

  if (abort)
    {
      fprintf(stderr,"Memory type not found: %X\n",required_mem_props);
      exit(EXIT_FAILURE);
    }

  return UINT32_MAX;
}

//
//
//

#ifdef NDEBUG
#define HS_BENCH_LOOPS  100
#define HS_BENCH_WARMUP 100
#else
#define HS_BENCH_LOOPS  1
#define HS_BENCH_WARMUP 0
#endif

//
//
//

int
main(int argc, char const * argv[])
{
  //
  // select the target by vendor and device id
  //
  uint32_t const vendor_id     = (argc <= 1) ? UINT32_MAX : strtoul(argv[1],NULL,16);
  uint32_t const device_id     = (argc <= 2) ? UINT32_MAX : strtoul(argv[2],NULL,16);
  uint32_t const key_val_words = (argc <= 3) ? 1          : strtoul(argv[3],NULL,0);

  if ((key_val_words != 1) && (key_val_words != 2))
    {
      fprintf(stderr,"Key/Val Words must be 1 or 2\n");
      exit(EXIT_FAILURE);
    }

  //
  // create a Vulkan instance
  //
  VkApplicationInfo const app_info = {
    .sType              = VK_STRUCTURE_TYPE_APPLICATION_INFO,
    .pNext              = NULL,
    .pApplicationName   = "Google HotSort Bench",
    .applicationVersion = 0,
    .pEngineName        = "Google HotSort Gen",
    .engineVersion      = 0,
    .apiVersion         = VK_API_VERSION_1_1
  };

  char const * const instance_enabled_layers[] = {
    "VK_LAYER_LUNARG_standard_validation"
  };

  char const * const instance_enabled_extensions[] = {
    VK_EXT_DEBUG_REPORT_EXTENSION_NAME
  };

  uint32_t const instance_enabled_layer_count =
#ifndef NDEBUG
    ARRAY_LENGTH_MACRO(instance_enabled_layers)
#else
    0
#endif
    ;

  uint32_t const instance_enabled_extension_count =
#ifndef NDEBUG
    ARRAY_LENGTH_MACRO(instance_enabled_extensions)
#else
    0
#endif
    ;

  VkInstanceCreateInfo const instance_info = {
    .sType                   = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
    .pNext                   = NULL,
    .flags                   = 0,
    .pApplicationInfo        = &app_info,
    .enabledLayerCount       = instance_enabled_layer_count,
    .ppEnabledLayerNames     = instance_enabled_layers,
    .enabledExtensionCount   = instance_enabled_extension_count,
    .ppEnabledExtensionNames = instance_enabled_extensions
  };

  VkInstance instance;

  vk(CreateInstance(&instance_info,NULL,&instance));

  //
  //
  //
#ifndef NDEBUG
  PFN_vkCreateDebugReportCallbackEXT vkCreateDebugReportCallbackEXT =
    (PFN_vkCreateDebugReportCallbackEXT)
    vkGetInstanceProcAddr(instance,"vkCreateDebugReportCallbackEXT");

  PFN_vkDestroyDebugReportCallbackEXT vkDestroyDebugReportCallbackEXT =
    (PFN_vkDestroyDebugReportCallbackEXT)
    vkGetInstanceProcAddr(instance,"vkDestroyDebugReportCallbackEXT");

  struct VkDebugReportCallbackCreateInfoEXT const drcci = {
    .sType       = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT,
    .pNext       = NULL,
    .flags       = UINT32_MAX, // enable everything for now
    .pfnCallback = vk_debug_report_cb,
    .pUserData   = NULL
  };

  VkDebugReportCallbackEXT drc;

  vk(CreateDebugReportCallbackEXT(instance,
                                  &drcci,
                                  NULL,
                                  &drc));
#endif

  //
  // acquire all physical devices and select a match
  //
  uint32_t phy_device_count;

  vk(EnumeratePhysicalDevices(instance,
                              &phy_device_count,
                              NULL));

  VkPhysicalDevice * phy_devices = vk_host_alloc(NULL,phy_device_count * sizeof(*phy_devices));

  vk(EnumeratePhysicalDevices(instance,
                              &phy_device_count,
                              phy_devices));

  VkPhysicalDevice           phy_device = VK_NULL_HANDLE;
  VkPhysicalDeviceProperties phy_device_props;

  struct hs_vk_target const * hs_target;

  for (uint32_t ii=0; ii<phy_device_count; ii++)
    {
      VkPhysicalDeviceProperties tmp;

      vkGetPhysicalDeviceProperties(phy_devices[ii],&tmp);

      bool const is_match = is_matching_device(&tmp,
                                               &hs_target,
                                               vendor_id,
                                               device_id,
                                               key_val_words);

      fprintf(stdout,"%c %4X : %4X : %s\n",
              is_match ? '*' : ' ',
              tmp.vendorID,
              tmp.deviceID,
              tmp.deviceName);

      if (is_match)
        {
          phy_device = phy_devices[ii];
          memcpy(&phy_device_props,&tmp,sizeof(tmp));
        }
    }

  if (phy_device == VK_NULL_HANDLE)
    {
      fprintf(stderr,"Device %4X:%4X not found.\n",
              vendor_id & 0xFFFF,
              device_id & 0xFFFF);

      return EXIT_FAILURE;
    }

  vk_host_free(NULL,phy_devices);

  //
  // Get rest of command line
  //
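  //   argv[4..10]: count_lo count_hi count_step loops warmup linearize verify
  //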
  uint32_t const slab_size = hs_target->config.slab.height << hs_target->config.slab.width_log2;

  uint32_t const count_lo   = (argc <=  4) ? slab_size       : strtoul(argv[ 4],NULL,0);
  uint32_t const count_hi   = (argc <=  5) ? count_lo        : strtoul(argv[ 5],NULL,0);
  uint32_t const count_step = (argc <=  6) ? count_lo        : strtoul(argv[ 6],NULL,0);
  uint32_t const loops      = (argc <=  7) ? HS_BENCH_LOOPS  : strtoul(argv[ 7],NULL,0);
  uint32_t const warmup     = (argc <=  8) ? HS_BENCH_WARMUP : strtoul(argv[ 8],NULL,0);
  bool     const linearize  = (argc <=  9) ? true            : strtoul(argv[ 9],NULL,0) != 0;
  bool     const verify     = (argc <= 10) ? true            : strtoul(argv[10],NULL,0) != 0;

  //
  // get the physical device's memory props
  //
  VkPhysicalDeviceMemoryProperties phy_device_mem_props;

  vkGetPhysicalDeviceMemoryProperties(phy_device,&phy_device_mem_props);

  //
  // get queue properties
  //
  VkQueueFamilyProperties queue_fam_props[2];
  uint32_t                queue_fam_count = ARRAY_LENGTH_MACRO(queue_fam_props);

  vkGetPhysicalDeviceQueueFamilyProperties(phy_device,&queue_fam_count,queue_fam_props);

  //
  // create device
  //
  float const queue_priorities[] = { 1.0f };

  VkDeviceQueueCreateInfo const queue_info = {
    .sType            = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
    .pNext            = NULL,
    .flags            = 0,
    .queueFamilyIndex = 0,
    .queueCount       = 1,
    .pQueuePriorities = queue_priorities
  };

  //
  // clumsily enable AMD GCN shader info extension
  //
  char const * const device_enabled_extensions[] = {
#if defined( HS_VK_VERBOSE_STATISTICS_AMD ) || defined( HS_VK_VERBOSE_DISASSEMBLY_AMD )
    VK_AMD_SHADER_INFO_EXTENSION_NAME
#endif
  };

  uint32_t device_enabled_extension_count = 0;

#if defined( HS_VK_VERBOSE_STATISTICS_AMD ) || defined( HS_VK_VERBOSE_DISASSEMBLY_AMD )
  if (phy_device_props.vendorID == 0x1002)
    device_enabled_extension_count = 1;
#endif

  //
  // enable the shaderInt64 feature when sorting 64-bit keys
  //
  VkPhysicalDeviceFeatures device_features = { false };

  if (key_val_words == 2)
    {
      device_features.shaderInt64 = true;
    }

  VkDeviceCreateInfo const device_info = {
    .sType                   = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
    .pNext                   = NULL,
    .flags                   = 0,
    .queueCreateInfoCount    = 1,
    .pQueueCreateInfos       = &queue_info,
    .enabledLayerCount       = 0,
    .ppEnabledLayerNames     = NULL,
    .enabledExtensionCount   = device_enabled_extension_count,
    .ppEnabledExtensionNames = device_enabled_extensions,
    .pEnabledFeatures        = &device_features
  };

  VkDevice device;

  vk(CreateDevice(phy_device,&device_info,NULL,&device));

  //
  // get a queue
  //
  VkQueue queue;

  vkGetDeviceQueue(device,0,0,&queue);

  //
  // get the pipeline cache
  //
  VkPipelineCache pipeline_cache;

  vk_pipeline_cache_create(device,NULL,".vk_cache",&pipeline_cache);

  //
  // create a descriptor set pool
  //
  VkDescriptorPoolSize const dps[] = {
    {
      .type            = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
      .descriptorCount = 2
    }
  };

  VkDescriptorPoolCreateInfo const dpci = {
    .sType         = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
    .pNext         = NULL,
    .flags         = 0, // VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
    .maxSets       = 1,
    .poolSizeCount = ARRAY_LENGTH_MACRO(dps),
    .pPoolSizes    = dps
  };

  VkDescriptorPool desc_pool;

  vk(CreateDescriptorPool(device,
                          &dpci,
                          NULL, // allocator
                          &desc_pool));

  //
  // create HotSort device instance
  //
  struct hs_vk * hs = hs_vk_create(hs_target,
                                   device,
                                   NULL,
                                   pipeline_cache);

  //
  // create a HotSort descriptor set for this thread
  //
  VkDescriptorSet hs_ds = hs_vk_ds_alloc(hs,desc_pool);

  //
  // create a command pool for this thread
  //
  VkCommandPoolCreateInfo const cmd_pool_info = {
    .sType            = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
    .pNext            = NULL,
    .flags            = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT |
                        VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
    .queueFamilyIndex = 0,
  };

  VkCommandPool cmd_pool;

  vk(CreateCommandPool(device,
                       &cmd_pool_info,
                       NULL,
                       &cmd_pool));

  //
  // create a query pool for benchmarking
  //
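  // (queryCount is 4 but only timestamp queries 0 and 1 are used below)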
  static VkQueryPoolCreateInfo const query_pool_info = {
    .sType              = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
    .pNext              = NULL,
    .flags              = 0,
    .queryType          = VK_QUERY_TYPE_TIMESTAMP,
    .queryCount         = 4,
    .pipelineStatistics = 0
  };

  VkQueryPool query_pool;

  vk(CreateQueryPool(device,
                     &query_pool_info,
                     NULL,
                     &query_pool));

  //
  // create two big buffers -- buffer_out_count is always the largest
  //
  uint32_t buffer_in_count, buffer_out_count;

  hs_vk_pad(hs,count_hi,&buffer_in_count,&buffer_out_count);

  size_t const buffer_out_size = buffer_out_count * key_val_words * sizeof(uint32_t);

  VkBufferCreateInfo bci = {
    .sType                 = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
    .pNext                 = NULL,
    .flags                 = 0,
    .size                  = buffer_out_size,
    .usage                 = 0,
    .sharingMode           = VK_SHARING_MODE_EXCLUSIVE,
    .queueFamilyIndexCount = 0,
    .pQueueFamilyIndices   = NULL
  };

  VkBuffer vin, vout, sorted, rand;

  bci.usage =
    VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
    VK_BUFFER_USAGE_TRANSFER_DST_BIT;

  vk(CreateBuffer(device,
                  &bci,
                  NULL,
                  &vin));

  vk(CreateBuffer(device,
                  &bci,
                  NULL,
                  &sorted));

  bci.usage =
    VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
    VK_BUFFER_USAGE_TRANSFER_SRC_BIT   |
    VK_BUFFER_USAGE_TRANSFER_DST_BIT;

  vk(CreateBuffer(device,
                  &bci,
                  NULL,
                  &vout));

  bci.usage =
    VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
    VK_BUFFER_USAGE_TRANSFER_SRC_BIT;

  vk(CreateBuffer(device,
                  &bci,
                  NULL,
                  &rand));

  //
  // get memory requirements for each of the buffers
  //
  VkMemoryRequirements mr_vin, mr_vout, mr_sorted, mr_rand;

  vkGetBufferMemoryRequirements(device,vin,   &mr_vin);
  vkGetBufferMemoryRequirements(device,vout,  &mr_vout);

  vkGetBufferMemoryRequirements(device,sorted,&mr_sorted);
  vkGetBufferMemoryRequirements(device,rand,  &mr_rand);

  //
  // allocate memory for the buffers
  //
  // for simplicity, all buffers are the same size
  //
  // vin and vout have the same usage
  //
  VkMemoryAllocateInfo const mai_vin_vout = {
    .sType           = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
    .pNext           = NULL,
    .allocationSize  = mr_vin.size,
    .memoryTypeIndex = vk_find_mem_type_idx(&phy_device_mem_props,
                                            mr_vin.memoryTypeBits,
                                            VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
                                            true)
  };

  VkMemoryAllocateInfo const mai_sorted_rand = {
    .sType           = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
    .pNext           = NULL,
    .allocationSize  = mr_sorted.size,
    .memoryTypeIndex = vk_find_mem_type_idx(&phy_device_mem_props,
                                            mr_sorted.memoryTypeBits,
                                            VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
                                            VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
                                            true)
  };

  VkDeviceMemory mem_vin, mem_vout, mem_sorted, mem_rand;

  vk(AllocateMemory(device,
                    &mai_vin_vout,
                    NULL,
                    &mem_vin));

  vk(AllocateMemory(device,
                    &mai_vin_vout,
                    NULL,
                    &mem_vout));

  vk(AllocateMemory(device,
                    &mai_sorted_rand,
                    NULL,
                    &mem_sorted));

  vk(AllocateMemory(device,
                    &mai_sorted_rand,
                    NULL,
                    &mem_rand));

  //
  // bind backing memory to the virtual allocations
  //
  vk(BindBufferMemory(device,vin,   mem_vin,   0));
  vk(BindBufferMemory(device,vout,  mem_vout,  0));

  vk(BindBufferMemory(device,sorted,mem_sorted,0));
  vk(BindBufferMemory(device,rand,  mem_rand,  0));

  //
  // map and fill the rand buffer with random values
  //
  void * rand_h   = vk_host_alloc(NULL,buffer_out_size);
  void * sorted_h = vk_host_alloc(NULL,buffer_out_size);

  hs_fill_rand(rand_h,buffer_out_count,key_val_words);

  void * rand_map;

  vk(MapMemory(device,mem_rand,0,VK_WHOLE_SIZE,0,&rand_map));

  memcpy(rand_map,rand_h,buffer_out_size);

  vkUnmapMemory(device,mem_rand);

  //
  // create a single command buffer for this thread
  //
  VkCommandBufferAllocateInfo const cmd_buffer_info = {
    .sType              = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
    .pNext              = NULL,
    .commandPool        = cmd_pool,
    .level              = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
    .commandBufferCount = 1
  };

  VkCommandBuffer cb;

  vk(AllocateCommandBuffers(device,
                            &cmd_buffer_info,
                            &cb));

  //
  //
  //
  static VkCommandBufferBeginInfo const cb_begin_info = {
    .sType            = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
    .pNext            = NULL,
    .flags            = 0, // VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
    .pInheritanceInfo = NULL
  };

  struct VkSubmitInfo const submit_info = {
    .sType                = VK_STRUCTURE_TYPE_SUBMIT_INFO,
    .pNext                = NULL,
    .waitSemaphoreCount   = 0,
    .pWaitSemaphores      = NULL,
    .pWaitDstStageMask    = NULL,
    .commandBufferCount   = 1,
    .pCommandBuffers      = &cb,
    .signalSemaphoreCount = 0,
    .pSignalSemaphores    = NULL
  };

  //
  // labels
  //
  fprintf(stdout,
          "Device, "
          "Driver, "
          "Type, "
          "Slab/Linear, "
          "Verified?, "
          "Keys, "
          "Keys Padded In, "
          "Keys Padded Out, "
          "CPU, "
          "Algorithm, "
          "CPU Msecs, "
          "CPU Mkeys/s, "
          "GPU, "
          "Trials, "
          "Avg. Msecs, "
          "Min Msecs, "
          "Max Msecs, "
          "Avg. Mkeys/s, "
          "Max. Mkeys/s\n");

  //
  // test a range
  //
  for (uint32_t count=count_lo; count<=count_hi; count+=count_step)
    {
      //
      // size the vin and vout arrays
      //
      uint32_t count_padded_in, count_padded_out;

      hs_vk_pad(hs,count,&count_padded_in,&count_padded_out);

      //
      // initialize vin with 'count' random keys
      //
      vkBeginCommandBuffer(cb,&cb_begin_info);

      VkBufferCopy const copy_rand = {
        .srcOffset = 0,
        .dstOffset = 0,
        .size      = count * key_val_words * sizeof(uint32_t)
      };

      vkCmdCopyBuffer(cb,
                      rand,
                      vin,
                      1,
                      &copy_rand);

      vk(EndCommandBuffer(cb));

      vk(QueueSubmit(queue,
                     1,
                     &submit_info,
                     VK_NULL_HANDLE)); // FIXME -- put a fence here

      // wait for queue to drain
      vk(QueueWaitIdle(queue));
      vk(ResetCommandBuffer(cb,0));
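
      //
      // NOTE: per the FIXMEs above, waiting on a fence would be more
      // precise than draining the whole queue.  A minimal sketch of
      // the fence-based wait (not wired into this bench; 'fence' and
      // 'fci' are hypothetical locals):
      //
      //   VkFenceCreateInfo const fci = {
      //     .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
      //     .pNext = NULL,
      //     .flags = 0
      //   };
      //   VkFence fence;
      //   vk(CreateFence(device,&fci,NULL,&fence));
      //   vk(QueueSubmit(queue,1,&submit_info,fence));
      //   vk(WaitForFences(device,1,&fence,VK_TRUE,UINT64_MAX));
      //   vkDestroyFence(device,fence,NULL);
      //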

      //
      // build the sorting command buffer
      //
      vkBeginCommandBuffer(cb,&cb_begin_info);

      //
      // starting timestamp
      //
      vkCmdWriteTimestamp(cb,VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,query_pool,0);

      //
      // bind the vin/vout buffers early
      //
      hs_vk_ds_bind(hs,hs_ds,cb,vin,vout);

      //
      // append sorting commands
      //
      hs_vk_sort(hs,
                 cb,
                 vin,0,0,
                 vout,0,0,
                 count,
                 count_padded_in,
                 count_padded_out,
                 linearize);

      //
      // end timestamp
      //
      vkCmdWriteTimestamp(cb,VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,query_pool,1);

      //
      // end the command buffer
      //
      vk(EndCommandBuffer(cb));

      //
      // measure the min/max/avg execution time
      //
      uint64_t elapsed_ns_min = UINT64_MAX;
      uint64_t elapsed_ns_max = 0;
      uint64_t elapsed_ns_sum = 0;

      for (uint32_t ii=0; ii<warmup+loops; ii++)
        {
          // discard the stats accumulated during the warmup runs
          if (ii == warmup)
            {
              elapsed_ns_min = UINT64_MAX;
              elapsed_ns_max = 0;
              elapsed_ns_sum = 0;
            }

          vk(QueueSubmit(queue,
                         1,
                         &submit_info,
                         VK_NULL_HANDLE)); // FIXME -- put a fence here

          // wait for queue to drain
          vk(QueueWaitIdle(queue));

          // get results
          uint64_t timestamps[2];

          vk(GetQueryPoolResults(device,query_pool,
                                 0,ARRAY_LENGTH_MACRO(timestamps),
                                 sizeof(timestamps),
                                 timestamps,
                                 sizeof(timestamps[0]),
                                 VK_QUERY_RESULT_64_BIT |
                                 VK_QUERY_RESULT_WAIT_BIT));

          uint64_t const t = timestamps[1] - timestamps[0];

          elapsed_ns_min  = MIN_MACRO(elapsed_ns_min,t);
          elapsed_ns_max  = MAX_MACRO(elapsed_ns_max,t);
          elapsed_ns_sum += t;
        }

      vk(ResetCommandBuffer(cb,0));

      //
      // copy the results back and, optionally, verify them
      //
      char const * cpu_algo = NULL;
      double       cpu_ns   = 0.0;
      bool         verified = false;

      if (verify)
        {
          size_t const size_padded_in = count_padded_in * key_val_words * sizeof(uint32_t);

          vkBeginCommandBuffer(cb,&cb_begin_info);

          VkBufferCopy const copy_vout = {
            .srcOffset = 0,
            .dstOffset = 0,
            .size      = size_padded_in
          };

          vkCmdCopyBuffer(cb,
                          vout,
                          sorted,
                          1,
                          &copy_vout);

          vk(EndCommandBuffer(cb));

          vk(QueueSubmit(queue,
                         1,
                         &submit_info,
                         VK_NULL_HANDLE)); // FIXME -- put a fence here

          // wait for queue to drain
          vk(QueueWaitIdle(queue));
          vk(ResetCommandBuffer(cb,0));

          size_t const size_sorted_h = count * key_val_words * sizeof(uint32_t);

          // copy the random keys, pad the tail with all-ones keys,
          // and sort on the CPU
          memcpy(sorted_h,rand_h,size_sorted_h);
          memset((uint8_t*)sorted_h + size_sorted_h,-1,size_padded_in-size_sorted_h);

          cpu_algo = hs_cpu_sort(sorted_h,key_val_words,count_padded_in,&cpu_ns);

          void * sorted_map;

          vk(MapMemory(device,mem_sorted,0,VK_WHOLE_SIZE,0,&sorted_map));

          if (!linearize) {
            hs_transpose_slabs(key_val_words,
                               1u<<hs_target->config.slab.width_log2,
                               hs_target->config.slab.height,
                               sorted_map,
                               count_padded_in);
          }

          // verify
          verified = memcmp(sorted_h,sorted_map,size_padded_in) == 0;

#ifndef NDEBUG
          if (!verified)
            {
              if (key_val_words == 1)
                {
                  hs_debug_u32(1u<<hs_target->config.slab.width_log2,
                               hs_target->config.slab.height,
                               sorted_h,
                               count);

                  hs_debug_u32(1u<<hs_target->config.slab.width_log2,
                               hs_target->config.slab.height,
                               sorted_map,
                               count);
                }
              else // ulong
                {
                  hs_debug_u64(1u<<hs_target->config.slab.width_log2,
                               hs_target->config.slab.height,
                               sorted_h,
                               count);

                  hs_debug_u64(1u<<hs_target->config.slab.width_log2,
                               hs_target->config.slab.height,
                               sorted_map,
                               count);
                }
            }
#endif

          vkUnmapMemory(device,mem_sorted);
        }

      //
      // REPORT
      //
      float const timestamp_period = phy_device_props.limits.timestampPeriod;
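
      // (GPU timestamps are expressed in ticks -- timestampPeriod is
      // the number of nanoseconds per tick)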

      fprintf(stdout,"%s, %u.%u.%u.%u, %s, %s, %s, %8u, %8u, %8u, CPU, %s, %9.2f, %6.2f, GPU, %9u, %7.3f, %7.3f, %7.3f, %6.2f, %6.2f\n",
              phy_device_props.deviceName,
              (phy_device_props.driverVersion>>24)&0xFF,
              (phy_device_props.driverVersion>>16)&0xFF,
              (phy_device_props.driverVersion>> 8)&0xFF,
              (phy_device_props.driverVersion    )&0xFF,
              (key_val_words == 1) ? "uint" : "ulong",
              linearize ? "linear" : "slab",
              verify ? (verified ? "  OK  " : "*FAIL*") : "UNVERIFIED",
              count,
              count_padded_in,
              count_padded_out,
              // CPU
              verify ? cpu_algo : "UNVERIFIED",
              verify ? (cpu_ns / 1000000.0)      : 0.0, // milliseconds
              verify ? (1000.0 * count / cpu_ns) : 0.0, // mkeys / sec
              // GPU
              loops,
              timestamp_period * elapsed_ns_sum / 1e6 / loops,               // avg msecs
              timestamp_period * elapsed_ns_min / 1e6,                       // min msecs
              timestamp_period * elapsed_ns_max / 1e6,                       // max msecs
              1000.0 * count * loops / (timestamp_period * elapsed_ns_sum),  // mkeys / sec - avg
              1000.0 * count / (timestamp_period * elapsed_ns_min));         // mkeys / sec - max
    }

  // reset the descriptor pool
  vk(ResetDescriptorPool(device,desc_pool,0));

  //
  // cleanup
  //

  // release shared HotSort state
  hs_vk_release(hs);

  // destroy the vin/vout buffers (before device memory)
  vkDestroyBuffer(device,vin,   NULL);
  vkDestroyBuffer(device,vout,  NULL);
  vkDestroyBuffer(device,sorted,NULL);
  vkDestroyBuffer(device,rand,  NULL);

  // free device memory
  vkFreeMemory(device,mem_vin,   NULL);
  vkFreeMemory(device,mem_vout,  NULL);
  vkFreeMemory(device,mem_sorted,NULL);
  vkFreeMemory(device,mem_rand,  NULL);

  // free host memory
  vk_host_free(NULL,rand_h);
  vk_host_free(NULL,sorted_h);

  // destroy the descriptor pool
  vkDestroyDescriptorPool(device,desc_pool,NULL);

  // destroy remaining...
  vkDestroyQueryPool(device,query_pool,NULL);
  vkFreeCommandBuffers(device,cmd_pool,1,&cb);
  vkDestroyCommandPool(device,cmd_pool,NULL);

  vk_pipeline_cache_destroy(device,NULL,".vk_cache",pipeline_cache);

  vkDestroyDevice(device,NULL);

#ifndef NDEBUG
  vkDestroyDebugReportCallbackEXT(instance,drc,NULL);
#endif

  vkDestroyInstance(instance,NULL);

  return EXIT_SUCCESS;
}

//
//
//