1 /*
2 * Copyright 2016 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can
5 * be found in the LICENSE file.
6 *
7 */
8
9 //
10 //
11 //
12
13 #include <stdlib.h>
14 #include <stdio.h>
15 #include <string.h>
16 #include <inttypes.h>
17
18 //
19 // squelch OpenCL 1.2 deprecation warning
20 //
21
22 #ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS
23 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
24 #endif
25
26 #include "common/macros.h"
27 #include "common/cl/assert_cl.h"
28 #include "common/cl/find_cl.h"
29 //
30 //
31 //
32
33 #include "hs_cl.h"
34
35 //
36 // FIXME -- LIMITED TO INTEL / GEN8+ FOR NOW
37 //
38
39 #include "intel/gen8/u32/hs_target.h"
40 #include "intel/gen8/u64/hs_target.h"
41
42 // #include "intel/gen9lp/u32/hs_target.h"
43 // #include "intel/gen9lp/u64/hs_target.h"
44
45 //
46 // The quality of the RNG doesn't matter. The same number of
47 // instructions will be run no matter what the key distribution looks
48 // like. So here is something small and fast.
49 //
50
static
uint32_t
hs_rand_u32()
{
  // Numerical Recipes LCG -- quality is irrelevant here since the
  // sort runs the same instruction count for any key distribution.
  static uint32_t state = 0xDEADBEEF;

  state = 1664525u * state + 1013904223u;

  return state;
}
62
63 //
64 //
65 //
66
static
void
hs_fill_rand(uint32_t * vin_h, uint32_t const count, uint32_t const words)
{
  // Fill `count` keys of `words` dwords each.  The inactive branches
  // below are kept for manual experimentation with pre-sorted inputs.
#if 1
  uint32_t const total = count * words;

  for (uint32_t idx=0; idx<total; idx++)
    vin_h[idx] = hs_rand_u32();
#elif 0 // in-order
  memset(vin_h,0,count*words*sizeof(uint32_t));
  for (uint32_t idx=0; idx<count; idx++)
    vin_h[idx*words] = idx;
#else // reverse order
  memset(vin_h,0,count*words*sizeof(uint32_t));
  for (uint32_t idx=0; idx<count; idx++)
    vin_h[idx*words] = count - 1 - idx;
#endif
}
84
85 //
86 //
87 //
88
89 char const * hs_cpu_sort_u32(uint32_t * a, uint32_t const count, double * const cpu_ns);
90 char const * hs_cpu_sort_u64(uint64_t * a, uint32_t const count, double * const cpu_ns);
91
92 //
93 //
94 //
95
static
char const *
hs_cpu_sort(void * sorted_h,
            uint32_t const hs_words,
            uint32_t const count,
            double * const cpu_ns)
{
  // dispatch on key width: 1 dword -> uint sort, otherwise ulong sort;
  // returns the name of the CPU algorithm that was used
  return (hs_words == 1)
    ? hs_cpu_sort_u32(sorted_h,count,cpu_ns)
    : hs_cpu_sort_u64(sorted_h,count,cpu_ns);
}
108
static
void
hs_transpose_slabs_u32(uint32_t const hs_words,
                       uint32_t const hs_width,
                       uint32_t const hs_height,
                       uint32_t * vout_h,
                       uint32_t const count)
{
  // keys per slab and the slab's footprint in bytes
  uint32_t const slab_keys = hs_width * hs_height;
  size_t const slab_size = sizeof(uint32_t) * hs_words * slab_keys;
  uint32_t * const scratch = ALLOCA_MACRO(slab_size);

  // transpose each full slab in place (trailing partial slab, if any,
  // is left untouched)
  for (uint32_t slabs_left = count / slab_keys; slabs_left > 0; slabs_left--)
    {
      // snapshot the slab, then write it back transposed
      memcpy(scratch,vout_h,slab_size);

      for (uint32_t col=0; col<hs_width; col++)
        for (uint32_t row=0; row<hs_height; row++)
          vout_h[col * hs_height + row] = scratch[row * hs_width + col];

      vout_h += slab_keys;
    }
}
133
static
void
hs_transpose_slabs_u64(uint32_t const hs_words,
                       uint32_t const hs_width,
                       uint32_t const hs_height,
                       uint64_t * vout_h,
                       uint32_t const count)
{
  // keys per slab and the slab's footprint in bytes -- the key size is
  // expressed as hs_words dwords (hs_words == 2 for a u64 key)
  uint32_t const slab_keys = hs_width * hs_height;
  size_t const slab_size = sizeof(uint32_t) * hs_words * slab_keys;
  uint64_t * const scratch = ALLOCA_MACRO(slab_size);

  // transpose each full slab in place (trailing partial slab, if any,
  // is left untouched)
  for (uint32_t slabs_left = count / slab_keys; slabs_left > 0; slabs_left--)
    {
      // snapshot the slab, then write it back transposed
      memcpy(scratch,vout_h,slab_size);

      for (uint32_t col=0; col<hs_width; col++)
        for (uint32_t row=0; row<hs_height; row++)
          vout_h[col * hs_height + row] = scratch[row * hs_width + col];

      vout_h += slab_keys;
    }
}
158
static
void
hs_transpose_slabs(uint32_t const hs_words,
                   uint32_t const hs_width,
                   uint32_t const hs_height,
                   void * vout_h,
                   uint32_t const count)
{
  // dispatch on key width
  switch (hs_words)
    {
    case 1:
      hs_transpose_slabs_u32(hs_words,hs_width,hs_height,vout_h,count);
      break;
    default:
      hs_transpose_slabs_u64(hs_words,hs_width,hs_height,vout_h,count);
      break;
    }
}
172
173 //
174 //
175 //
176
static
void
hs_debug_u32(uint32_t const hs_width,
             uint32_t const hs_height,
             uint32_t const * vout_h,
             uint32_t const count)
{
  uint32_t const slab_keys = hs_width * hs_height;
  uint32_t const slabs = (count + slab_keys - 1) / slab_keys; // round up

  // dump each slab to stderr as hs_height rows of hs_width hex keys,
  // preceded by the slab index
  for (uint32_t slab=0; slab<slabs; slab++)
    {
      fprintf(stderr,"%u\n",slab);

      for (uint32_t row=0; row<hs_height; row++)
        {
          for (uint32_t col=0; col<hs_width; col++)
            fprintf(stderr,"%8" PRIX32 " ",*vout_h++);

          fprintf(stderr,"\n");
        }
    }
}
196
static
void
hs_debug_u64(uint32_t const hs_width,
             uint32_t const hs_height,
             uint64_t const * vout_h,
             uint32_t const count)
{
  uint32_t const slab_keys = hs_width * hs_height;
  uint32_t const slabs = (count + slab_keys - 1) / slab_keys; // round up

  // dump each slab to stderr as hs_height rows of hs_width hex keys,
  // preceded by the slab index
  for (uint32_t slab=0; slab<slabs; slab++)
    {
      fprintf(stderr,"%u\n",slab);

      for (uint32_t row=0; row<hs_height; row++)
        {
          for (uint32_t col=0; col<hs_width; col++)
            fprintf(stderr,"%16" PRIX64 " ",*vout_h++);

          fprintf(stderr,"\n");
        }
    }
}
216
217 //
218 // Used for benchmarking on out-of-order queues. Attaching an event
219 // to a kernel on an OOQ with profiling enabled will result in a
220 // synchronization point and block concurrent execution of kernels.
221 //
222 // The workaround that enables measuring the entire runtime of the
223 // sort is to launch a dummy kernel with an event, a barrier without
224 // an event, then the call to hs_sort(), followed by a final dummy
225 // kernel with an event.
226 //
227 // The end time of the first dummy and start time of the second dummy
228 // will provide a conservative estimate of the total execution time of
229 // the hs_sort() routine.
230 //
231 // Note that once kernels are enqueued they are scheduled with only
232 // microseconds between them so this should only be a small number of
233 // microseconds longer than the true hs_sort() execution time.
234 //
235
236 #define HS_DUMMY_KERNEL_PROGRAM "kernel void hs_dummy_kernel() { ; }"
237
238 static cl_kernel hs_dummy_kernel;
239
240 static
241 void
hs_dummy_kernel_create(cl_context context,cl_device_id device_id)242 hs_dummy_kernel_create(cl_context context, cl_device_id device_id)
243 {
244 cl_int err;
245
246 char const * strings[] = { HS_DUMMY_KERNEL_PROGRAM };
247 size_t const strings_sizeof[] = { sizeof(HS_DUMMY_KERNEL_PROGRAM) };
248
249 cl_program program = clCreateProgramWithSource(context,
250 1,
251 strings,
252 strings_sizeof,
253 &err); cl_ok(err);
254 cl(BuildProgram(program,
255 1,
256 &device_id,
257 NULL,
258 NULL,
259 NULL));
260
261 hs_dummy_kernel = clCreateKernel(program,"hs_dummy_kernel",&err); cl_ok(err);
262
263 cl(ReleaseProgram(program));
264 }
265
266 static
267 void
hs_dummy_kernel_release()268 hs_dummy_kernel_release()
269 {
270 cl(ReleaseKernel(hs_dummy_kernel));
271 }
272
273 static
274 void
hs_dummy_kernel_enqueue(cl_command_queue cq,uint32_t wait_list_size,cl_event const * wait_list,cl_event * event)275 hs_dummy_kernel_enqueue(cl_command_queue cq,
276 uint32_t wait_list_size,
277 cl_event const * wait_list,
278 cl_event * event)
279 {
280 size_t const global_work_size = 1;
281
282 cl(EnqueueNDRangeKernel(cq,
283 hs_dummy_kernel,
284 1,
285 NULL,
286 &global_work_size,
287 NULL,
288 wait_list_size,
289 wait_list,
290 event));
291 }
292
293 //
294 //
295 //
296
//
// Benchmark driver.
//
// For each key count in [count_lo,count_hi] (stepping by count_step):
// sorts random keys on the device, times the sort with the dummy
// kernel bracketing technique described above, verifies the device
// output against a CPU reference sort, and prints one CSV row.
//
// Allocates and releases its own host and device buffers.
//
static
void
hs_bench(cl_context context,
         cl_command_queue cq,
         cl_command_queue cq_profile,
         char const * const device_name,
         char const * const driver_version,
         uint32_t const hs_words,
         uint32_t const hs_width,
         uint32_t const hs_height,
         struct hs_cl const * const hs,
         uint32_t const count_lo,
         uint32_t const count_hi,
         uint32_t const count_step,
         uint32_t const loops,
         uint32_t const warmup,
         bool const linearize)
{
  //
  // return if nothing to do
  //
  if (count_hi <= 1)
    return;

  //
  // size the arrays
  //
  // NOTE(review): presumably hs_cl_pad() rounds the key count up to
  // the slab geometry the sorting kernels require -- confirm against
  // hs_cl.h
  //
  uint32_t count_hi_padded_in, count_hi_padded_out;

  hs_cl_pad(hs,count_hi,&count_hi_padded_in,&count_hi_padded_out);

  //
  // SIZE
  //
  // a key is hs_words 32-bit dwords wide (1 = uint, 2 = ulong)
  //
  size_t const key_size = sizeof(uint32_t) * hs_words;

  size_t const size_hi_in = count_hi_padded_in * key_size;
  size_t const size_hi_out = count_hi_padded_out * key_size;

  //
  // ALLOCATE
  //
  // sorted_h : host buffer for the CPU reference sort
  // random   : immutable random source keys
  // vin      : per-size sort input (copied from random)
  // vout     : sort output
  //
  cl_int cl_err;

  void * sorted_h = malloc(size_hi_in);

  cl_mem random = clCreateBuffer(context,
                                 CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
                                 size_hi_in,
                                 NULL,&cl_err); cl_ok(cl_err);

  cl_mem vin = clCreateBuffer(context,
                              CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                              size_hi_in,
                              NULL,&cl_err); cl_ok(cl_err);

  cl_mem vout = clCreateBuffer(context,
                               CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                               size_hi_out,
                               NULL,&cl_err); cl_ok(cl_err);
  //
  // BLOCKING MAP AND INIT KEYS
  //
  {
    void * random_h = clEnqueueMapBuffer(cq,
                                         random,
                                         CL_TRUE,
                                         CL_MAP_WRITE_INVALIDATE_REGION,
                                         0,size_hi_in,
                                         0,NULL,NULL,
                                         &cl_err); cl_ok(cl_err);

    // fill with random numbers
    hs_fill_rand(random_h,count_hi,hs_words);

    //
    // UNMAP
    //
    cl(EnqueueUnmapMemObject(cq,random,random_h,0,NULL,NULL));
  }

  //
  // BENCHMARK
  //
  for (uint32_t count=count_lo; count<=count_hi; count+=count_step)
    {
      // compute padding before sorting
      uint32_t count_padded_in, count_padded_out;

      hs_cl_pad(hs,count,&count_padded_in,&count_padded_out);

      cl_ulong elapsed_ns_min = UINT64_MAX;
      cl_ulong elapsed_ns_max = 0;
      cl_ulong elapsed_ns_sum = 0;

      // load vin with fresh random keys once per count
      cl(EnqueueCopyBuffer(cq,random,vin,0,0,count * key_size,0,NULL,NULL));
      cl(Finish(cq));

      for (uint32_t ii=0; ii<warmup+loops; ii++)
        {
          // discard timings accumulated during warmup
          if (ii == warmup)
            {
              elapsed_ns_min = UINT64_MAX;
              elapsed_ns_max = 0;
              elapsed_ns_sum = 0;
            }

#if 0
          //
          // optionally, initialize vin on every loop -- no need
          //
          cl(EnqueueCopyBuffer(cq,random,vin,0,0,count * key_size,0,NULL,NULL));
          cl(Finish(cq));
#endif

          //
          // sort vin
          //
          // bracket the sort between two dummy kernels on the
          // profiling queue -- see explanation above
          //
          cl_event start, complete, end;

          hs_dummy_kernel_enqueue(cq_profile,0,NULL,&start);

          // note hs_sort enqueues a final barrier
          hs_cl_sort(hs,
                     cq,
                     1,&start,&complete,
                     vin,vout,
                     count,
                     count_padded_in,
                     count_padded_out,
                     linearize);

          hs_dummy_kernel_enqueue(cq_profile,1,&complete,&end);

          cl(Finish(cq_profile));

          //
          // measure duration
          //
          // conservative window: END of the first dummy kernel to
          // START of the second dummy kernel
          //
          cl_ulong t_start=0, t_end=0;

          // start
          cl(GetEventProfilingInfo(start,
                                   CL_PROFILING_COMMAND_END,
                                   sizeof(cl_ulong),
                                   &t_start,
                                   NULL));

          // end
          cl(GetEventProfilingInfo(end,
                                   CL_PROFILING_COMMAND_START,
                                   sizeof(cl_ulong),
                                   &t_end,
                                   NULL));

          cl_ulong const t = t_end - t_start;

          elapsed_ns_min = MIN_MACRO(elapsed_ns_min,t);
          elapsed_ns_max = MAX_MACRO(elapsed_ns_max,t);
          elapsed_ns_sum += t;

          cl(ReleaseEvent(start));
          cl(ReleaseEvent(complete));
          cl(ReleaseEvent(end));
        }

      //
      // COPY KEYS BACK FOR VERIFICATION
      //
      // two non-blocking maps followed by a Finish()
      //
      size_t const size_padded_in = count_padded_in * key_size;

      void * vin_h = clEnqueueMapBuffer(cq,
                                        vin,
                                        CL_FALSE,
                                        CL_MAP_READ,
                                        0,size_padded_in,
                                        0,NULL,NULL,
                                        &cl_err); cl_ok(cl_err);

      void * vout_h = clEnqueueMapBuffer(cq,
                                         vout,
                                         CL_FALSE,
                                         CL_MAP_READ,
                                         0,size_padded_in,
                                         0,NULL,NULL,
                                         &cl_err); cl_ok(cl_err);
      cl(Finish(cq));

      //
      // SORT THE UNTOUCHED RANDOM INPUT
      //
      memcpy(sorted_h,vin_h,size_padded_in);

      double cpu_ns;

      char const * const algo = hs_cpu_sort(sorted_h,hs_words,count_padded_in,&cpu_ns);

      //
      // TRANSPOSE THE DEVICE OUTPUT BACK TO LINEAR ORDER IF NOT
      // LINEARIZING -- so it can be compared against the linear CPU
      // reference sort
      //
      if (!linearize)
        hs_transpose_slabs(hs_words,hs_width,hs_height,vout_h,count_padded_in);

      //
      // VERIFY
      //
      bool const verified = memcmp(sorted_h,vout_h,size_padded_in) == 0;

#ifndef NDEBUG
      // on mismatch in a debug build, dump the slabs for inspection
      if (!verified)
        {
          if (hs_words == 1)
            hs_debug_u32(hs_width,hs_height,vout_h,count);
          else // ulong
            hs_debug_u64(hs_width,hs_height,vout_h,count);
        }
#endif

      cl(EnqueueUnmapMemObject(cq,vin, vin_h, 0,NULL,NULL));
      cl(EnqueueUnmapMemObject(cq,vout,vout_h,0,NULL,NULL));

      cl(Finish(cq));

      //
      // REPORT -- one CSV row per count
      //
      fprintf(stdout,"%s, %s, %s, %s, %s, %8u, %8u, %8u, CPU, %s, %9.2f, %6.2f, GPU, %9u, %7.3f, %7.3f, %7.3f, %6.2f, %6.2f\n",
              device_name,
              driver_version,
              (hs_words == 1) ? "uint" : "ulong",
              linearize ? "linear" : "slab",
              verified ? " OK " : "*FAIL*",
              count,
              count_padded_in,
              count_padded_out,
              // CPU
              algo,
              cpu_ns / 1000000.0, // milliseconds
              1000.0 * count / cpu_ns, // mkeys / sec
              // GPU
              loops,
              elapsed_ns_sum / 1000000.0 / loops, // avg msecs
              elapsed_ns_min / 1000000.0, // min msecs
              elapsed_ns_max / 1000000.0, // max msecs
              1000.0 * count * loops / elapsed_ns_sum, // mkeys / sec - avg
              1000.0 * count / elapsed_ns_min); // mkeys / sec - max

      // quit early if not verified
      if (!verified)
        break;
    }

  //
  // dispose
  //
  cl(ReleaseMemObject(vout));
  cl(ReleaseMemObject(vin));
  cl(ReleaseMemObject(random));
  free(sorted_h);
}
557
558 //
559 //
560 //
561
//
// Entry point: benchmarks HotSort on an Intel Graphics OpenCL device.
//
// Optional positional arguments (all parsed with strtoul, base auto):
//   argv[1]  key width in dwords: 1 = uint keys, 2 = ulong keys (default 2)
//   argv[2]  count_lo   (default: one slab of keys)
//   argv[3]  count_hi   (default: count_lo)
//   argv[4]  count_step (default: count_lo)
//   argv[5]  benchmark loops
//   argv[6]  warmup loops
//   argv[7]  linearize flag (default true)
//
int
main(int argc, char const * argv[])
{
  // substring-match the desired OpenCL platform and device names
  char const * const target_platform_substring = "Intel";
  char const * const target_device_substring = "Graphics";

  //
  // find platform and device ids
  //
  cl_platform_id platform_id;
  cl_device_id device_id;

#define HS_DEVICE_NAME_SIZE 64

  char device_name[HS_DEVICE_NAME_SIZE];
  size_t device_name_size;

  cl(FindIdsByName(target_platform_substring,
                   target_device_substring,
                   &platform_id,
                   &device_id,
                   HS_DEVICE_NAME_SIZE,
                   device_name,
                   &device_name_size,
                   true));
  //
  // create context
  //
  cl_context_properties context_properties[] =
    {
      CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id,
      0
    };

  cl_int cl_err;
  cl_context context = clCreateContext(context_properties,
                                       1,
                                       &device_id,
                                       NULL,
                                       NULL,
                                       &cl_err);
  cl_ok(cl_err);

  //
  // create command queues
  //
  // cq         : out-of-order queue the sort kernels run on
  // cq_profile : profiling-enabled queue for the timing dummy kernels
  //
#if 0 // OPENCL 2.0

  // NOTE(review): if this path is re-enabled, the profiling flag
  // should be OR'd into the single CL_QUEUE_PROPERTIES value -- as
  // written the extra array element would be parsed as a property key
  cl_queue_properties props[] = {
    CL_QUEUE_PROPERTIES,
    (cl_queue_properties)CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
#ifndef NDEBUG
    (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE,
#endif
    0
  };

  cl_queue_properties props_profile[] = {
    CL_QUEUE_PROPERTIES,
    (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE,
    0
  };

  cl_command_queue cq = clCreateCommandQueueWithProperties(context,
                                                           device_id,
                                                           props,
                                                           &cl_err); cl_ok(cl_err);

  cl_command_queue cq_profile = clCreateCommandQueueWithProperties(context,
                                                                   device_id,
                                                                   props_profile,
                                                                   &cl_err); cl_ok(cl_err);
#else // OPENCL 1.2

  cl_command_queue cq = clCreateCommandQueue(context,
                                             device_id,
#ifndef NDEBUG
                                             CL_QUEUE_PROFILING_ENABLE |
#endif
                                             CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
                                             &cl_err); cl_ok(cl_err);

  cl_command_queue cq_profile = clCreateCommandQueue(context,
                                                     device_id,
                                                     CL_QUEUE_PROFILING_ENABLE,
                                                     &cl_err); cl_ok(cl_err);
#endif

  //
  // Intel GEN workaround -- create dummy kernel for semi-accurate
  // profiling on an out-of-order queue.
  //
  hs_dummy_kernel_create(context,device_id);

  //
  // select the target: key width in dwords (1 = uint, 2 = ulong)
  //

  uint32_t const key_val_words = (argc == 1) ? 2 : strtoul(argv[1],NULL,0);

  struct hs_cl_target const * hs_target;

  if (key_val_words == 1)
    hs_target = &hs_intel_gen8_u32;
  else
    hs_target = &hs_intel_gen8_u64;

  //
  // create kernels
  //
  fprintf(stdout,"Creating... ");

  struct hs_cl * const hs = hs_cl_create(hs_target,context,device_id);

  fprintf(stdout,"done.\n");

  //
  // benchmark controls -- debug builds run a single untimed trial
  //

#ifdef NDEBUG
#define HS_BENCH_LOOPS 100
#define HS_BENCH_WARMUP 100
#else
#define HS_BENCH_LOOPS 1
#define HS_BENCH_WARMUP 0
#endif

  //
  // sort sizes and loops
  //
  // kpb: keys per slab = slab height * slab width
  //
  uint32_t const kpb = hs_target->config.slab.height << hs_target->config.slab.width_log2;

  uint32_t const count_lo = (argc <= 2) ? kpb : strtoul(argv[2],NULL,0);
  uint32_t const count_hi = (argc <= 3) ? count_lo : strtoul(argv[3],NULL,0);
  uint32_t const count_step = (argc <= 4) ? count_lo : strtoul(argv[4],NULL,0);
  uint32_t const loops = (argc <= 5) ? HS_BENCH_LOOPS : strtoul(argv[5],NULL,0);
  uint32_t const warmup = (argc <= 6) ? HS_BENCH_WARMUP : strtoul(argv[6],NULL,0);
  bool const linearize = (argc <= 7) ? true : strtoul(argv[7],NULL,0);

  //
  // labels -- CSV column headers matching hs_bench()'s report rows
  //
  fprintf(stdout,
          "Device, "
          "Driver, "
          "Type, "
          "Slab/Linear, "
          "Verified?, "
          "Keys, "
          "Keys Padded In, "
          "Keys Padded Out, "
          "CPU Algorithm, "
          "CPU Msecs, "
          "CPU Mkeys/s, "
          "Trials, "
          "Avg. Msecs, "
          "Min Msecs, "
          "Max Msecs, "
          "Avg. Mkeys/s, "
          "Max. Mkeys/s\n");

  //
  // we want to track driver versions -- two-step query: size first,
  // then the string itself
  //
  size_t driver_version_size;

  cl(GetDeviceInfo(device_id,
                   CL_DRIVER_VERSION,
                   0,
                   NULL,
                   &driver_version_size));

  char * const driver_version = ALLOCA_MACRO(driver_version_size);

  cl(GetDeviceInfo(device_id,
                   CL_DRIVER_VERSION,
                   driver_version_size,
                   driver_version,
                   NULL));
  //
  // benchmark
  //
  hs_bench(context,
           cq,cq_profile,
           device_name,
           driver_version,
           hs_target->config.words.key + hs_target->config.words.val,
           1 << hs_target->config.slab.width_log2,
           hs_target->config.slab.height,
           hs,
           count_lo,
           count_hi,
           count_step,
           loops,
           warmup,
           linearize);

  //
  // release everything
  //
  hs_cl_release(hs);

  hs_dummy_kernel_release();

  cl(ReleaseCommandQueue(cq));
  cl(ReleaseCommandQueue(cq_profile));

  cl(ReleaseContext(context));

  return 0;
}
774