• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2016 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can
5  * be found in the LICENSE file.
6  *
7  */
8 
9 //
10 //
11 //
12 
13 #include <stdlib.h>
14 #include <stdio.h>
15 #include <string.h>
16 #include <inttypes.h>
17 
18 //
19 // squelch OpenCL 1.2 deprecation warning
20 //
21 
22 #ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS
23 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
24 #endif
25 
26 #include "common/macros.h"
27 #include "common/cl/assert_cl.h"
28 #include "common/cl/find_cl.h"
29 //
30 //
31 //
32 
33 #include "hs_cl.h"
34 
35 //
36 // FIXME -- LIMITED TO INTEL / GEN8+ FOR NOW
37 //
38 
39 #include "intel/gen8/u32/hs_target.h"
40 #include "intel/gen8/u64/hs_target.h"
41 
42 // #include "intel/gen9lp/u32/hs_target.h"
43 // #include "intel/gen9lp/u64/hs_target.h"
44 
45 //
46 // The quality of the RNG doesn't matter.  The same number of
47 // instructions will be run no matter what the key distribution looks
48 // like.  So here is something small and fast.
49 //
50 
51 static
52 uint32_t
hs_rand_u32()53 hs_rand_u32()
54 {
55   static uint32_t seed = 0xDEADBEEF;
56 
57   // Numerical Recipes
58   seed = seed * 1664525 + 1013904223;
59 
60   return seed;
61 }
62 
63 //
64 //
65 //
66 
67 static
68 void
hs_fill_rand(uint32_t * vin_h,uint32_t const count,uint32_t const words)69 hs_fill_rand(uint32_t * vin_h, uint32_t const count, uint32_t const words)
70 {
71 #if   1
72   for (uint32_t ii=0; ii<count*words; ii++)
73     vin_h[ii] = hs_rand_u32();
74 #elif 0 // in-order
75   memset(vin_h,0,count*words*sizeof(uint32_t));
76   for (uint32_t ii=0; ii<count; ii++)
77     vin_h[ii*words] = ii;
78 #else   // reverse order
79   memset(vin_h,0,count*words*sizeof(uint32_t));
80   for (uint32_t ii=0; ii<count; ii++)
81     vin_h[ii*words] = count - 1 - ii;
82 #endif
83 }
84 
85 //
86 //
87 //
88 
89 char const * hs_cpu_sort_u32(uint32_t * a, uint32_t const count, double * const cpu_ns);
90 char const * hs_cpu_sort_u64(uint64_t * a, uint32_t const count, double * const cpu_ns);
91 
92 //
93 //
94 //
95 
96 static
97 char const *
hs_cpu_sort(void * sorted_h,uint32_t const hs_words,uint32_t const count,double * const cpu_ns)98 hs_cpu_sort(void     *       sorted_h,
99             uint32_t   const hs_words,
100             uint32_t   const count,
101             double   * const cpu_ns)
102 {
103   if (hs_words == 1)
104     return hs_cpu_sort_u32(sorted_h,count,cpu_ns);
105   else
106     return hs_cpu_sort_u64(sorted_h,count,cpu_ns);
107 }
108 
109 static
110 void
hs_transpose_slabs_u32(uint32_t const hs_words,uint32_t const hs_width,uint32_t const hs_height,uint32_t * vout_h,uint32_t const count)111 hs_transpose_slabs_u32(uint32_t const hs_words,
112                        uint32_t const hs_width,
113                        uint32_t const hs_height,
114                        uint32_t *     vout_h,
115                        uint32_t const count)
116 {
117   uint32_t   const slab_keys  = hs_width * hs_height;
118   size_t     const slab_size  = sizeof(uint32_t) * hs_words * slab_keys;
119   uint32_t * const slab       = ALLOCA_MACRO(slab_size);
120   uint32_t         slab_count = count / slab_keys;
121 
122   while (slab_count-- > 0)
123     {
124       memcpy(slab,vout_h,slab_size);
125 
126       for (uint32_t row=0; row<hs_height; row++)
127         for (uint32_t col=0; col<hs_width; col++)
128           vout_h[col * hs_height + row] = slab[row * hs_width + col];
129 
130       vout_h += slab_keys;
131     }
132 }
133 
134 static
135 void
hs_transpose_slabs_u64(uint32_t const hs_words,uint32_t const hs_width,uint32_t const hs_height,uint64_t * vout_h,uint32_t const count)136 hs_transpose_slabs_u64(uint32_t const hs_words,
137                        uint32_t const hs_width,
138                        uint32_t const hs_height,
139                        uint64_t *     vout_h,
140                        uint32_t const count)
141 {
142   uint32_t   const slab_keys  = hs_width * hs_height;
143   size_t     const slab_size  = sizeof(uint32_t) * hs_words * slab_keys;
144   uint64_t * const slab       = ALLOCA_MACRO(slab_size);
145   uint32_t         slab_count = count / slab_keys;
146 
147   while (slab_count-- > 0)
148     {
149       memcpy(slab,vout_h,slab_size);
150 
151       for (uint32_t row=0; row<hs_height; row++)
152         for (uint32_t col=0; col<hs_width; col++)
153           vout_h[col * hs_height + row] = slab[row * hs_width + col];
154 
155       vout_h += slab_keys;
156     }
157 }
158 
159 static
160 void
hs_transpose_slabs(uint32_t const hs_words,uint32_t const hs_width,uint32_t const hs_height,void * vout_h,uint32_t const count)161 hs_transpose_slabs(uint32_t const hs_words,
162                    uint32_t const hs_width,
163                    uint32_t const hs_height,
164                    void   *       vout_h,
165                    uint32_t const count)
166 {
167   if (hs_words == 1)
168     hs_transpose_slabs_u32(hs_words,hs_width,hs_height,vout_h,count);
169   else
170     hs_transpose_slabs_u64(hs_words,hs_width,hs_height,vout_h,count);
171 }
172 
173 //
174 //
175 //
176 
177 static
178 void
hs_debug_u32(uint32_t const hs_width,uint32_t const hs_height,uint32_t const * vout_h,uint32_t const count)179 hs_debug_u32(uint32_t const   hs_width,
180              uint32_t const   hs_height,
181              uint32_t const * vout_h,
182              uint32_t const   count)
183 {
184   uint32_t const slab_keys = hs_width * hs_height;
185   uint32_t const slabs     = (count + slab_keys - 1) / slab_keys;
186 
187   for (uint32_t ss=0; ss<slabs; ss++) {
188     fprintf(stderr,"%u\n",ss);
189     for (uint32_t cc=0; cc<hs_height; cc++) {
190       for (uint32_t rr=0; rr<hs_width; rr++)
191         fprintf(stderr,"%8" PRIX32 " ",*vout_h++);
192       fprintf(stderr,"\n");
193     }
194   }
195 }
196 
197 static
198 void
hs_debug_u64(uint32_t const hs_width,uint32_t const hs_height,uint64_t const * vout_h,uint32_t const count)199 hs_debug_u64(uint32_t const   hs_width,
200              uint32_t const   hs_height,
201              uint64_t const * vout_h,
202              uint32_t const   count)
203 {
204   uint32_t const slab_keys = hs_width * hs_height;
205   uint32_t const slabs     = (count + slab_keys - 1) / slab_keys;
206 
207   for (uint32_t ss=0; ss<slabs; ss++) {
208     fprintf(stderr,"%u\n",ss);
209     for (uint32_t cc=0; cc<hs_height; cc++) {
210       for (uint32_t rr=0; rr<hs_width; rr++)
211         fprintf(stderr,"%16" PRIX64 " ",*vout_h++);
212       fprintf(stderr,"\n");
213     }
214   }
215 }
216 
217 //
218 // Used for benchmarking on out-of-order queues.  Attaching an event
219 // to a kernel on an OOQ with profiling enabled will result in a
220 // synchronization point and block concurrent execution of kernels.
221 //
222 // The workaround that enables measuring the entire runtime of the
223 // sort is to launch a dummy kernel with an event, a barrier without
224 // an event, then the call to hs_sort(), followed by a final dummy
225 // kernel with an event.
226 //
227 // The end time of the first dummy and start time of the second dummy
228 // will provide a conservative estimate of the total execution time of
229 // the hs_sort() routine.
230 //
231 // Note that once kernels are enqueued they are scheduled with only
232 // microseconds between them so this should only be a small number of
233 // microseconds longer than the true hs_sort() execution time.
234 //
235 
236 #define HS_DUMMY_KERNEL_PROGRAM "kernel void hs_dummy_kernel() { ; }"
237 
238 static cl_kernel hs_dummy_kernel;
239 
240 static
241 void
hs_dummy_kernel_create(cl_context context,cl_device_id device_id)242 hs_dummy_kernel_create(cl_context context, cl_device_id device_id)
243 {
244   cl_int err;
245 
246   char   const * strings[]        = { HS_DUMMY_KERNEL_PROGRAM         };
247   size_t const   strings_sizeof[] = { sizeof(HS_DUMMY_KERNEL_PROGRAM) };
248 
249   cl_program program = clCreateProgramWithSource(context,
250                                                  1,
251                                                  strings,
252                                                  strings_sizeof,
253                                                  &err); cl_ok(err);
254   cl(BuildProgram(program,
255                   1,
256                   &device_id,
257                   NULL,
258                   NULL,
259                   NULL));
260 
261   hs_dummy_kernel = clCreateKernel(program,"hs_dummy_kernel",&err); cl_ok(err);
262 
263   cl(ReleaseProgram(program));
264 }
265 
266 static
267 void
hs_dummy_kernel_release()268 hs_dummy_kernel_release()
269 {
270   cl(ReleaseKernel(hs_dummy_kernel));
271 }
272 
273 static
274 void
hs_dummy_kernel_enqueue(cl_command_queue cq,uint32_t wait_list_size,cl_event const * wait_list,cl_event * event)275 hs_dummy_kernel_enqueue(cl_command_queue cq,
276                         uint32_t         wait_list_size,
277                         cl_event const * wait_list,
278                         cl_event       * event)
279 {
280   size_t const global_work_size = 1;
281 
282   cl(EnqueueNDRangeKernel(cq,
283                           hs_dummy_kernel,
284                           1,
285                           NULL,
286                           &global_work_size,
287                           NULL,
288                           wait_list_size,
289                           wait_list,
290                           event));
291 }
292 
293 //
294 //
295 //
296 
297 static
298 void
hs_bench(cl_context context,cl_command_queue cq,cl_command_queue cq_profile,char const * const device_name,char const * const driver_version,uint32_t const hs_words,uint32_t const hs_width,uint32_t const hs_height,struct hs_cl const * const hs,uint32_t const count_lo,uint32_t const count_hi,uint32_t const count_step,uint32_t const loops,uint32_t const warmup,bool const linearize)299 hs_bench(cl_context                   context,
300          cl_command_queue             cq,
301          cl_command_queue             cq_profile,
302          char           const * const device_name,
303          char           const * const driver_version,
304          uint32_t               const hs_words,
305          uint32_t               const hs_width,
306          uint32_t               const hs_height,
307          struct hs_cl   const * const hs,
308          uint32_t               const count_lo,
309          uint32_t               const count_hi,
310          uint32_t               const count_step,
311          uint32_t               const loops,
312          uint32_t               const warmup,
313          bool                   const linearize)
314 {
315   //
316   // return if nothing to do
317   //
318   if (count_hi <= 1)
319     return;
320 
321   //
322   // size the arrays
323   //
324   uint32_t count_hi_padded_in, count_hi_padded_out;
325 
326   hs_cl_pad(hs,count_hi,&count_hi_padded_in,&count_hi_padded_out);
327 
328   //
329   // SIZE
330   //
331   size_t const key_size    = sizeof(uint32_t)    * hs_words;
332 
333   size_t const size_hi_in  = count_hi_padded_in  * key_size;
334   size_t const size_hi_out = count_hi_padded_out * key_size;
335 
336   //
337   // ALLOCATE
338   //
339   cl_int cl_err;
340 
341   void * sorted_h = malloc(size_hi_in);
342 
343   cl_mem random   = clCreateBuffer(context,
344                                    CL_MEM_READ_ONLY  | CL_MEM_ALLOC_HOST_PTR,
345                                    size_hi_in,
346                                    NULL,&cl_err); cl_ok(cl_err);
347 
348   cl_mem vin      = clCreateBuffer(context,
349                                    CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
350                                    size_hi_in,
351                                    NULL,&cl_err); cl_ok(cl_err);
352 
353   cl_mem vout     = clCreateBuffer(context,
354                                    CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
355                                    size_hi_out,
356                                    NULL,&cl_err); cl_ok(cl_err);
357   //
358   // BLOCKING MAP AND INIT KEYS
359   //
360   {
361     void * random_h = clEnqueueMapBuffer(cq,
362                                          random,
363                                          CL_TRUE,
364                                          CL_MAP_WRITE_INVALIDATE_REGION,
365                                          0,size_hi_in,
366                                          0,NULL,NULL,
367                                          &cl_err); cl_ok(cl_err);
368 
369     // fill with random numbers
370     hs_fill_rand(random_h,count_hi,hs_words);
371 
372     //
373     // UNMAP
374     //
375     cl(EnqueueUnmapMemObject(cq,random,random_h,0,NULL,NULL));
376   }
377 
378   //
379   // BENCHMARK
380   //
381   for (uint32_t count=count_lo; count<=count_hi; count+=count_step)
382     {
383       // compute padding before sorting
384       uint32_t count_padded_in, count_padded_out;
385 
386       hs_cl_pad(hs,count,&count_padded_in,&count_padded_out);
387 
388       cl_ulong elapsed_ns_min = UINT64_MAX;
389       cl_ulong elapsed_ns_max = 0;
390       cl_ulong elapsed_ns_sum = 0;
391 
392       cl(EnqueueCopyBuffer(cq,random,vin,0,0,count * key_size,0,NULL,NULL));
393       cl(Finish(cq));
394 
395       for (uint32_t ii=0; ii<warmup+loops; ii++)
396         {
397           if (ii == warmup)
398             {
399               elapsed_ns_min = UINT64_MAX;
400               elapsed_ns_max = 0;
401               elapsed_ns_sum = 0;
402             }
403 
404 #if 0
405           //
406           // optionally, initialize vin on every loop -- no need
407           //
408           cl(EnqueueCopyBuffer(cq,random,vin,0,0,count * key_size,0,NULL,NULL));
409           cl(Finish(cq));
410 #endif
411 
412           //
413           // sort vin
414           //
415           cl_event start, complete, end;
416 
417           hs_dummy_kernel_enqueue(cq_profile,0,NULL,&start);
418 
419           // note hs_sort enqueues a final barrier
420           hs_cl_sort(hs,
421                      cq,
422                      1,&start,&complete,
423                      vin,vout,
424                      count,
425                      count_padded_in,
426                      count_padded_out,
427                      linearize);
428 
429           hs_dummy_kernel_enqueue(cq_profile,1,&complete,&end);
430 
431           cl(Finish(cq_profile));
432 
433           //
434           // measure duration
435           //
436           cl_ulong t_start=0, t_end=0;
437 
438           // start
439           cl(GetEventProfilingInfo(start,
440                                    CL_PROFILING_COMMAND_END,
441                                    sizeof(cl_ulong),
442                                    &t_start,
443                                    NULL));
444 
445           // end
446           cl(GetEventProfilingInfo(end,
447                                    CL_PROFILING_COMMAND_START,
448                                    sizeof(cl_ulong),
449                                    &t_end,
450                                    NULL));
451 
452           cl_ulong const t = t_end - t_start;
453 
454           elapsed_ns_min  = MIN_MACRO(elapsed_ns_min,t);
455           elapsed_ns_max  = MAX_MACRO(elapsed_ns_max,t);
456           elapsed_ns_sum += t;
457 
458           cl(ReleaseEvent(start));
459           cl(ReleaseEvent(complete));
460           cl(ReleaseEvent(end));
461         }
462 
463       //
464       // COPY KEYS BACK FOR VERIFICATION
465       //
466       size_t const size_padded_in = count_padded_in * key_size;
467 
468       void * vin_h = clEnqueueMapBuffer(cq,
469                                         vin,
470                                         CL_FALSE,
471                                         CL_MAP_READ,
472                                         0,size_padded_in,
473                                         0,NULL,NULL,
474                                         &cl_err); cl_ok(cl_err);
475 
476       void * vout_h = clEnqueueMapBuffer(cq,
477                                          vout,
478                                          CL_FALSE,
479                                          CL_MAP_READ,
480                                          0,size_padded_in,
481                                          0,NULL,NULL,
482                                          &cl_err); cl_ok(cl_err);
483       cl(Finish(cq));
484 
485       //
486       // SORT THE UNTOUCHED RANDOM INPUT
487       //
488       memcpy(sorted_h,vin_h,size_padded_in);
489 
490       double cpu_ns;
491 
492       char const * const algo = hs_cpu_sort(sorted_h,hs_words,count_padded_in,&cpu_ns);
493 
494       //
495       // EXPLICITLY TRANSPOSE THE CPU SORTED SLABS IF NOT LINEARIZING
496       //
497       if (!linearize)
498         hs_transpose_slabs(hs_words,hs_width,hs_height,vout_h,count_padded_in);
499 
500       //
501       // VERIFY
502       //
503       bool const verified = memcmp(sorted_h,vout_h,size_padded_in) == 0;
504 
505 #ifndef NDEBUG
506       if (!verified)
507         {
508           if (hs_words == 1)
509             hs_debug_u32(hs_width,hs_height,vout_h,count);
510           else // ulong
511             hs_debug_u64(hs_width,hs_height,vout_h,count);
512         }
513 #endif
514 
515       cl(EnqueueUnmapMemObject(cq,vin, vin_h, 0,NULL,NULL));
516       cl(EnqueueUnmapMemObject(cq,vout,vout_h,0,NULL,NULL));
517 
518       cl(Finish(cq));
519 
520       //
521       // REPORT
522       //
523       fprintf(stdout,"%s, %s, %s, %s, %s, %8u, %8u, %8u, CPU, %s, %9.2f, %6.2f, GPU, %9u, %7.3f, %7.3f, %7.3f, %6.2f, %6.2f\n",
524               device_name,
525               driver_version,
526               (hs_words == 1) ? "uint" : "ulong",
527               linearize ? "linear" : "slab",
528               verified ? "  OK  " : "*FAIL*",
529               count,
530               count_padded_in,
531               count_padded_out,
532               // CPU
533               algo,
534               cpu_ns / 1000000.0,                       // milliseconds
535               1000.0 * count / cpu_ns,                  // mkeys / sec
536               // GPU
537               loops,
538               elapsed_ns_sum / 1000000.0 / loops,       // avg msecs
539               elapsed_ns_min / 1000000.0,               // min msecs
540               elapsed_ns_max / 1000000.0,               // max msecs
541               1000.0 * count * loops / elapsed_ns_sum,  // mkeys / sec - avg
542               1000.0 * count         / elapsed_ns_min); // mkeys / sec - max
543 
544       // quit early if not verified
545       if (!verified)
546         break;
547     }
548 
549   //
550   // dispose
551   //
552   cl(ReleaseMemObject(vout));
553   cl(ReleaseMemObject(vin));
554   cl(ReleaseMemObject(random));
555   free(sorted_h);
556 }
557 
558 //
559 //
560 //
561 
562 int
main(int argc,char const * argv[])563 main(int argc, char const * argv[])
564 {
565   char const * const target_platform_substring = "Intel";
566   char const * const target_device_substring   = "Graphics";
567 
568   //
569   // find platform and device ids
570   //
571   cl_platform_id platform_id;
572   cl_device_id   device_id;
573 
574 #define HS_DEVICE_NAME_SIZE  64
575 
576   char   device_name[HS_DEVICE_NAME_SIZE];
577   size_t device_name_size;
578 
579   cl(FindIdsByName(target_platform_substring,
580                    target_device_substring,
581                    &platform_id,
582                    &device_id,
583                    HS_DEVICE_NAME_SIZE,
584                    device_name,
585                    &device_name_size,
586                    true));
587   //
588   // create context
589   //
590   cl_context_properties context_properties[] =
591     {
592       CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id,
593       0
594     };
595 
596   cl_int     cl_err;
597   cl_context context = clCreateContext(context_properties,
598                                        1,
599                                        &device_id,
600                                        NULL,
601                                        NULL,
602                                        &cl_err);
603   cl_ok(cl_err);
604 
605   //
606   // create command queue
607   //
608 #if 0 // OPENCL 2.0
609 
610   cl_queue_properties props[] = {
611     CL_QUEUE_PROPERTIES,
612     (cl_queue_properties)CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
613 #ifndef NDEBUG
614     (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE,
615 #endif
616     0
617   };
618 
619   cl_queue_properties props_profile[] = {
620     CL_QUEUE_PROPERTIES,
621     (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE,
622     0
623   };
624 
625   cl_command_queue cq = clCreateCommandQueueWithProperties(context,
626                                                            device_id,
627                                                            props,
628                                                            &cl_err); cl_ok(cl_err);
629 
630   cl_command_queue cq_profile = clCreateCommandQueueWithProperties(context,
631                                                                    device_id,
632                                                                    props_profile,
633                                                                    &cl_err); cl_ok(cl_err);
634 #else // OPENCL 1.2
635 
636   cl_command_queue cq = clCreateCommandQueue(context,
637                                              device_id,
638 #ifndef NDEBUG
639                                              CL_QUEUE_PROFILING_ENABLE |
640 #endif
641                                              CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
642                                              &cl_err); cl_ok(cl_err);
643 
644   cl_command_queue cq_profile = clCreateCommandQueue(context,
645                                                      device_id,
646                                                      CL_QUEUE_PROFILING_ENABLE,
647                                                      &cl_err); cl_ok(cl_err);
648 #endif
649 
650   //
651   // Intel GEN workaround -- create dummy kernel for semi-accurate
652   // profiling on an out-of-order queue.
653   //
654   hs_dummy_kernel_create(context,device_id);
655 
656   //
657   // select the target
658   //
659 
660   uint32_t const key_val_words = (argc == 1) ? 2 : strtoul(argv[1],NULL,0);
661 
662   struct hs_cl_target const * hs_target;
663 
664   if (key_val_words == 1)
665     hs_target = &hs_intel_gen8_u32;
666   else
667     hs_target = &hs_intel_gen8_u64;
668 
669   //
670   // create kernels
671   //
672   fprintf(stdout,"Creating... ");
673 
674   struct hs_cl * const hs = hs_cl_create(hs_target,context,device_id);
675 
676   fprintf(stdout,"done.\n");
677 
678   //
679   //
680   //
681 
682 #ifdef NDEBUG
683 #define HS_BENCH_LOOPS   100
684 #define HS_BENCH_WARMUP  100
685 #else
686 #define HS_BENCH_LOOPS   1
687 #define HS_BENCH_WARMUP  0
688 #endif
689 
690   //
691   // sort sizes and loops
692   //
693   uint32_t const kpb        = hs_target->config.slab.height << hs_target->config.slab.width_log2;
694 
695   uint32_t const count_lo   = (argc <= 2) ? kpb             : strtoul(argv[2],NULL,0);
696   uint32_t const count_hi   = (argc <= 3) ? count_lo        : strtoul(argv[3],NULL,0);
697   uint32_t const count_step = (argc <= 4) ? count_lo        : strtoul(argv[4],NULL,0);
698   uint32_t const loops      = (argc <= 5) ? HS_BENCH_LOOPS  : strtoul(argv[5],NULL,0);
699   uint32_t const warmup     = (argc <= 6) ? HS_BENCH_WARMUP : strtoul(argv[6],NULL,0);
700   bool     const linearize  = (argc <= 7) ? true            : strtoul(argv[7],NULL,0);
701 
702   //
703   // labels
704   //
705   fprintf(stdout,
706           "Device, "
707           "Driver, "
708           "Type, "
709           "Slab/Linear, "
710           "Verified?, "
711           "Keys, "
712           "Keys Padded In, "
713           "Keys Padded Out, "
714           "CPU Algorithm, "
715           "CPU Msecs, "
716           "CPU Mkeys/s, "
717           "Trials, "
718           "Avg. Msecs, "
719           "Min Msecs, "
720           "Max Msecs, "
721           "Avg. Mkeys/s, "
722           "Max. Mkeys/s\n");
723 
724   //
725   // we want to track driver versions
726   //
727   size_t driver_version_size;
728 
729   cl(GetDeviceInfo(device_id,
730                    CL_DRIVER_VERSION,
731                    0,
732                    NULL,
733                    &driver_version_size));
734 
735   char * const driver_version = ALLOCA_MACRO(driver_version_size);
736 
737   cl(GetDeviceInfo(device_id,
738                    CL_DRIVER_VERSION,
739                    driver_version_size,
740                    driver_version,
741                    NULL));
742   //
743   // benchmark
744   //
745   hs_bench(context,
746            cq,cq_profile,
747            device_name,
748            driver_version,
749            hs_target->config.words.key + hs_target->config.words.val,
750            1 << hs_target->config.slab.width_log2,
751            hs_target->config.slab.height,
752            hs,
753            count_lo,
754            count_hi,
755            count_step,
756            loops,
757            warmup,
758            linearize);
759 
760   //
761   // release everything
762   //
763   hs_cl_release(hs);
764 
765   hs_dummy_kernel_release();
766 
767   cl(ReleaseCommandQueue(cq));
768   cl(ReleaseCommandQueue(cq_profile));
769 
770   cl(ReleaseContext(context));
771 
772   return 0;
773 }
774