/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
//
//

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>

//
//
//

#include "common/cl/assert_cl.h"
#include "common/macros.h"
#include "common/util.h"

//
//
//

#include "hs_cl.h"

//
//
//

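// An hs_cl instance bundles the target's configuration with the
// compiled OpenCL kernel handles.  The bs/bc/fm/hm/transpose members
// point into the trailing flexible array member `all[]` so that every
// kernel can be released with a single loop.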
struct hs_cl
{
  struct hs_cl_target_config config;

  uint32_t key_val_size;
  uint32_t slab_keys;
  uint32_t bs_slabs_log2_ru;
  uint32_t bc_slabs_log2_max;

  struct {
    uint32_t    count;
    cl_kernel * bs;
    cl_kernel * bc;
    cl_kernel * fm[3];
    cl_kernel * hm[3];
    cl_kernel * transpose;
    cl_kernel   all[];
  } kernels;
};

//
//
//

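// Per-sort bookkeeping shared by every kernel launch issued from a
// single hs_cl_sort() call: the command queue, the key buffers, a
// small wait list that orders launches on an out-of-order queue, and
// the rounded-up slab count.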
struct hs_state
{
#ifndef NDEBUG
  cl_ulong         t_total; // 0
#endif

  cl_command_queue cq;

  // key buffers
  cl_mem           vin;
  cl_mem           vout; // can be vin

  // enforces ordering on out-of-order queue
  cl_event         wait_list[3]; // worst case
  uint32_t         wait_list_size;

  // bx_ru is the rounded-up number of slabs in vin
  uint32_t         bx_ru;
};

//
//
//

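// The wait-list helpers release the events accumulated from the
// previous launch and then append the events produced by the current
// launch -- this is what serializes dependent kernels when the
// command queue is out-of-order.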
static
void
hs_state_wait_list_release(struct hs_state * const state)
{
  for (uint32_t ii=0; ii<state->wait_list_size; ii++)
    cl(ReleaseEvent(state->wait_list[ii]));

  state->wait_list_size = 0;
}

static
void
hs_state_wait_list_update(struct hs_state * const state,
                          uint32_t         const wait_list_size,
                          cl_event const * const wait_list)
{
  uint32_t const new_size = state->wait_list_size + wait_list_size;

  // note: the incoming wait list is indexed from zero
  for (uint32_t ii=0; ii<wait_list_size; ii++)
    state->wait_list[state->wait_list_size + ii] = wait_list[ii];

  state->wait_list_size = new_size;
}

//
//
//

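// In debug builds the HS_STATE_WAIT_LIST_PROFILE*() macros drain the
// queue and dump per-event timing (when the queue was created with
// CL_QUEUE_PROFILING_ENABLE) and status info.  In release builds they
// expand to nothing.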
#ifdef NDEBUG

#define HS_STATE_WAIT_LIST_PROFILE(state)
#define HS_STATE_WAIT_LIST_PROFILE_EX(state,wait_list_size,wait_list)

#else

#include <stdio.h>

#define HS_STATE_WAIT_LIST_PROFILE(state)               \
  hs_state_wait_list_profile(state,                     \
                             state->wait_list_size,     \
                             state->wait_list)

#define HS_STATE_WAIT_LIST_PROFILE_EX(state,wait_list_size,wait_list)  \
  hs_state_wait_list_profile(state,                                    \
                             wait_list_size,                           \
                             wait_list)

static
void
hs_state_wait_list_profile(struct hs_state * const state,
                           uint32_t         const wait_list_size,
                           cl_event const * const wait_list)
{
  cl(Finish(state->cq));

  cl_command_queue_properties props;

  cl(GetCommandQueueInfo(state->cq,
                         CL_QUEUE_PROPERTIES,
                         sizeof(props),
                         &props,
                         NULL));

  for (uint32_t ii=0; ii<wait_list_size; ii++)
    {
      cl_event event = wait_list[ii];

      //
      // profiling
      //
      cl_ulong t_start=0, t_end=0;

      if (props & CL_QUEUE_PROFILING_ENABLE)
        {
          // start
          cl(GetEventProfilingInfo(event,
                                   CL_PROFILING_COMMAND_START,
                                   sizeof(cl_ulong),
                                   &t_start,
                                   NULL));
          // end
          cl(GetEventProfilingInfo(event,
                                   CL_PROFILING_COMMAND_END,
                                   sizeof(cl_ulong),
                                   &t_end,
                                   NULL));

          state->t_total += t_end - t_start;
        }

      //
      // status
      //
      cl_int          status;
      cl_command_type type;

      cl_get_event_info(event,&status,&type);

      fprintf(stdout,"%-13s, %-28s, %20"PRIu64", %20"PRIu64", %20"PRIu64", %20"PRIu64"\n",
              cl_get_event_command_status_string(status),
              cl_get_event_command_type_string(type),
              t_start,t_end,t_end-t_start,state->t_total);
    }
}

#endif

//
//
//

#ifdef NDEBUG

#define HS_TRACE(k,g,l)

#else

#define HS_KERNEL_NAME_MAX 20

static
void
hs_trace(cl_kernel            kernel,
         uint32_t       const dim,
         size_t const * const global_work_size)
{
  if (kernel == NULL)
    return;

  char name[HS_KERNEL_NAME_MAX];

  cl(GetKernelInfo(kernel,CL_KERNEL_FUNCTION_NAME,HS_KERNEL_NAME_MAX,name,NULL));

  fprintf(stderr,"%-19s ( %6zu, %6zu, %6zu )\n",
          name,
          global_work_size[0],
          dim < 2 ? 0 : global_work_size[1],
          dim < 3 ? 0 : global_work_size[2]);
}

#define HS_TRACE(k,d,g)  hs_trace(k,d,g)

#endif

//
//
//

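// Launches the transpose kernel, which converts the sorted keys in
// vout from slab ordering to linear ordering.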
static
void
hs_transpose(struct hs_cl const * const hs,
             struct hs_state    * const state)
{
  size_t const size[1] = { state->bx_ru << hs->config.slab.threads_log2 };
  cl_kernel    kernel  = hs->kernels.transpose[0];

  HS_TRACE(kernel,1,size);

  //
  // The transpose kernel operates on a single slab. For now, let's
  // rely on the driver to choose a workgroup size.
  //
  // size_t local_work_size[1] = { HS_SLAB_THREADS };
  //
  cl(SetKernelArg(kernel,0,sizeof(state->vout),&state->vout));

  cl_event wait_list_out[1];

  cl(EnqueueNDRangeKernel(state->cq,
                          kernel,
                          1,
                          NULL,
                          size,
                          NULL,
                          state->wait_list_size,
                          state->wait_list,
                          wait_list_out));

  hs_state_wait_list_release(state);
  hs_state_wait_list_update(state,1,wait_list_out);

  HS_STATE_WAIT_LIST_PROFILE(state);
}

//
//
//

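// Enqueues one half-merge ("hm") kernel for the given scale over a
// spans x span_threads grid and rechains the wait list to its
// completion event.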
static
void
hs_hm_enqueue(struct hs_cl const * const hs,
              struct hs_state    * const state,
              uint32_t             const scale_log2,
              uint32_t             const spans,
              uint32_t             const span_threads)
{
  //
  // Note that some platforms might need to use .z on large grids
  //
  size_t const size[3] = { span_threads, spans, 1 };
  cl_kernel    kernel  = hs->kernels.hm[scale_log2][0];

  HS_TRACE(kernel,3,size);

  cl(SetKernelArg(kernel,0,sizeof(state->vout),&state->vout));

  cl_event wait_list_out[1];

  cl(EnqueueNDRangeKernel(state->cq,
                          kernel,
                          3,
                          NULL,
                          size,
                          NULL,
                          state->wait_list_size,
                          state->wait_list,
                          wait_list_out));

  hs_state_wait_list_release(state);
  hs_state_wait_list_update(state,1,wait_list_out);

  HS_STATE_WAIT_LIST_PROFILE(state);
}

//
//
//

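// Applies a scaled half-merge pass to a span that is still too large
// for the block cleaner and returns the log2 of the span that remains
// to be cleaned.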
static
uint32_t
hs_hm(struct hs_cl const * const hs,
      struct hs_state    * const state,
      uint32_t             const down_slabs,
      uint32_t             const clean_slabs_log2)
{
  // how many scaled half-merge spans are there?
  uint32_t const frac_ru = (1 << clean_slabs_log2) - 1;
  uint32_t const spans   = (down_slabs + frac_ru) >> clean_slabs_log2;

  // for now, just clamp to the max
  uint32_t const log2_rem   = clean_slabs_log2 - hs->bc_slabs_log2_max;
  uint32_t const scale_log2 = MIN_MACRO(hs->config.merge.hm.scale_max,log2_rem);
  uint32_t const log2_out   = log2_rem - scale_log2;

  // size the grid
  uint32_t const span_threads = hs->slab_keys << log2_out;

  // launch the hm kernel
  hs_hm_enqueue(hs,
                state,
                scale_log2,
                spans,
                span_threads);

  return log2_out;
}

//
//
//

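// Enqueues the block-clean ("bc") kernel selected by clean_slabs_log2
// over `full` slabs and rechains the wait list to its completion
// event.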
static
void
hs_bc_enqueue(struct hs_cl const * const hs,
              struct hs_state    * const state,
              uint32_t             const full,
              uint32_t             const clean_slabs_log2)
{
  size_t const size[1] = { full << hs->config.slab.threads_log2 };
  cl_kernel    kernel  = hs->kernels.bc[clean_slabs_log2];

  HS_TRACE(kernel,1,size);

  cl(SetKernelArg(kernel,0,sizeof(state->vout),&state->vout));

  cl_event wait_list_out[1];

  cl(EnqueueNDRangeKernel(state->cq,
                          kernel,
                          1,
                          NULL,
                          size,
                          NULL,
                          state->wait_list_size,
                          state->wait_list,
                          wait_list_out));

  hs_state_wait_list_release(state);
  hs_state_wait_list_update(state,1,wait_list_out);

  HS_STATE_WAIT_LIST_PROFILE(state);
}

//
//
//

static
void
hs_bc(struct hs_cl const * const hs,
      struct hs_state    * const state,
      uint32_t             const down_slabs,
      uint32_t             const clean_slabs_log2)
{
  // block clean the minimal number of down_slabs_log2 spans
  uint32_t const frac_ru = (1u << clean_slabs_log2) - 1;
  uint32_t const full    = (down_slabs + frac_ru) & ~frac_ru;

  // we better be capable of cleaning at least two warps !!!
  hs_bc_enqueue(hs,state,full,clean_slabs_log2);
}

//
// FIXME -- some of this logic can be skipped if BS is a power-of-two
//

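// Enqueues the flip-merge ("fm") kernels: one launch covers the
// full-sized spans and, if needed, a second launch covers the single
// fractional span.  Both launches share the incoming wait list and
// their output events replace it.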
static
void
hs_fm_enqueue(struct hs_cl const * const hs,
              struct hs_state    * const state,
              uint32_t             const scale_log2,
              uint32_t             const fm_full,
              uint32_t             const fm_frac,
              uint32_t             const span_threads)
{
  //
  // Note that some platforms might need to use .z on large grids
  //
  uint32_t wait_list_out_size = 0;
  cl_event wait_list_out[2];

  if (fm_full > 0)
    {
      size_t const size_full[3] = { span_threads, fm_full, 1 };
      cl_kernel    kernel_full  = hs->kernels.fm[scale_log2][hs->bs_slabs_log2_ru-1+scale_log2];

      HS_TRACE(kernel_full,3,size_full);

      cl(SetKernelArg(kernel_full,0,sizeof(state->vout),&state->vout));

      cl(EnqueueNDRangeKernel(state->cq,
                              kernel_full,
                              3,
                              NULL,
                              size_full,
                              NULL,
                              state->wait_list_size,
                              state->wait_list,
                              wait_list_out+wait_list_out_size++));
    }

  if (fm_frac > 0)
    {
      size_t const offset_frac[3] = { 0, fm_full, 0 };
      size_t const size_frac [3]  = { span_threads, 1, 1 };
      cl_kernel    kernel_frac    = hs->kernels.fm[scale_log2][msb_idx_u32(fm_frac)];

      HS_TRACE(kernel_frac,3,size_frac);

      cl(SetKernelArg(kernel_frac,0,sizeof(state->vout),&state->vout));

      cl(EnqueueNDRangeKernel(state->cq,
                              kernel_frac,
                              3,
                              offset_frac,
                              size_frac,
                              NULL,
                              state->wait_list_size,
                              state->wait_list,
                              wait_list_out+wait_list_out_size++));
    }

  hs_state_wait_list_release(state);
  hs_state_wait_list_update(state,wait_list_out_size,wait_list_out);

  HS_STATE_WAIT_LIST_PROFILE(state);
}

//
//
//

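// Sizes and launches one flip-merge pass over the rounded-up slab
// count and returns the log2 of the span that must subsequently be
// cleaned (after any intervening half merges).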
static
uint32_t
hs_fm(struct hs_cl const * const hs,
      struct hs_state    * const state,
      uint32_t           * const down_slabs,
      uint32_t             const up_scale_log2)
{
  //
  // FIXME OPTIMIZATION: in previous HotSort launchers it's sometimes
  // a performance win to bias toward launching the smaller flip merge
  // kernel in order to get more warps in flight (increased
  // occupancy). This is useful when merging small numbers of slabs.
  //
  // Note that HS_FM_SCALE_MIN will always be 0 or 1.
  //
  // So, for now, just clamp to the max until there is a reason to
  // restore the fancier and probably low-impact approach.
  //
  uint32_t const scale_log2 = MIN_MACRO(hs->config.merge.fm.scale_max,up_scale_log2);
  uint32_t const clean_log2 = up_scale_log2 - scale_log2;

  // number of slabs in a full-sized scaled flip-merge span
  uint32_t const full_span_slabs = hs->config.block.slabs << up_scale_log2;

  // how many full-sized scaled flip-merge spans are there?
  uint32_t fm_full = state->bx_ru / full_span_slabs;
  uint32_t fm_frac = 0;

  // initialize down_slabs
  *down_slabs = fm_full * full_span_slabs;

  // how many half-size scaled + fractional scaled spans are there?
  uint32_t const span_rem        = state->bx_ru - *down_slabs;
  uint32_t const half_span_slabs = full_span_slabs >> 1;

  // if we have over a half-span then fractionally merge it
  if (span_rem > half_span_slabs)
    {
      // the remaining slabs will be cleaned
      *down_slabs += span_rem;

      uint32_t const frac_rem      = span_rem - half_span_slabs;
      uint32_t const frac_rem_pow2 = pow2_ru_u32(frac_rem);

      if (frac_rem_pow2 >= half_span_slabs)
        {
          // bump it up to a full span
          fm_full += 1;
        }
      else
        {
          // otherwise, add fractional
          fm_frac = MAX_MACRO(1,frac_rem_pow2 >> clean_log2);
        }
    }

  // size the grid
  uint32_t const span_threads = hs->slab_keys << clean_log2;

  //
  // launch the fm kernel
  //
  hs_fm_enqueue(hs,
                state,
                scale_log2,
                fm_full,
                fm_frac,
                span_threads);

  return clean_log2;
}

//
//
//

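// Enqueues the block-sort ("bs") kernels: one launch for the full
// blocks of slabs and, if needed, a second launch for the fractional
// remainder.  This is the only stage that binds vin as a kernel
// argument.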
static
void
hs_bs_enqueue(struct hs_cl const * const hs,
              struct hs_state    * const state,
              uint32_t             const full,
              uint32_t             const frac,
              uint32_t             const wait_list_size,
              cl_event           *       wait_list)
{
  uint32_t wait_list_out_size = 0;
  cl_event wait_list_out[2];

  if (full > 0)
    {
      size_t const size_full[1] = { full << hs->config.slab.threads_log2 };
      cl_kernel    kernel_full  = hs->kernels.bs[hs->bs_slabs_log2_ru];

      HS_TRACE(kernel_full,1,size_full);

      cl(SetKernelArg(kernel_full,0,sizeof(state->vin), &state->vin));
      cl(SetKernelArg(kernel_full,1,sizeof(state->vout),&state->vout));

      cl(EnqueueNDRangeKernel(state->cq,
                              kernel_full,
                              1,
                              NULL,
                              size_full,
                              NULL,
                              wait_list_size,
                              wait_list,
                              wait_list_out+wait_list_out_size++));
    }

  if (frac > 0)
    {
      size_t const offset_frac[1] = { full << hs->config.slab.threads_log2 };
      size_t const size_frac [1]  = { frac << hs->config.slab.threads_log2 };
      cl_kernel    kernel_frac    = hs->kernels.bs[msb_idx_u32(frac)];

      HS_TRACE(kernel_frac,1,size_frac);

      cl(SetKernelArg(kernel_frac,0,sizeof(state->vin), &state->vin));
      cl(SetKernelArg(kernel_frac,1,sizeof(state->vout),&state->vout));

      cl(EnqueueNDRangeKernel(state->cq,
                              kernel_frac,
                              1,
                              offset_frac,
                              size_frac,
                              NULL,
                              wait_list_size,
                              wait_list,
                              wait_list_out+wait_list_out_size++));
    }

  hs_state_wait_list_release(state);
  hs_state_wait_list_update(state,wait_list_out_size,wait_list_out);

  HS_STATE_WAIT_LIST_PROFILE(state);
}

//
//
//

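// Splits the padded input into full blocks of slabs plus a fractional
// remainder and block-sorts them.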
static
void
hs_bs(struct hs_cl const * const hs,
      struct hs_state    * const state,
      uint32_t             const count_padded_in,
      uint32_t             const wait_list_size,
      cl_event           *       wait_list)
{
  uint32_t const slabs_in = count_padded_in / hs->slab_keys;
  uint32_t const full     = (slabs_in / hs->config.block.slabs) * hs->config.block.slabs;
  uint32_t const frac     = slabs_in - full;

  hs_bs_enqueue(hs,state,
                full,frac,
                wait_list_size,wait_list);
}

//
//
//

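// Fills the trailing pad region of vin with max-valued key words
// ahead of the block sort.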
static
void
hs_keyset_pre_sort(struct hs_cl const * const hs,
                   struct hs_state    * const state,
                   uint32_t             const count,
                   uint32_t             const count_hi,
                   uint32_t             const wait_list_size,
                   cl_event           *       wait_list,
                   cl_event           *       event)
{
  uint32_t const vin_span = count_hi - count;
  uint32_t const pattern  = UINT32_MAX;

  cl(EnqueueFillBuffer(state->cq,
                       state->vin,
                       &pattern,
                       sizeof(pattern),
                       count    * hs->key_val_size,
                       vin_span * hs->key_val_size,
                       wait_list_size,
                       wait_list,
                       event));

  HS_STATE_WAIT_LIST_PROFILE_EX(state,1,event);
}

//
//
//

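// Fills the trailing pad region of vout with max-valued key words
// before merging.  Only needed when sorting out-of-place with
// count_padded_out > count_padded_in.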
static
void
hs_keyset_pre_merge(struct hs_cl const * const hs,
                    struct hs_state    * const state,
                    uint32_t             const count_lo,
                    uint32_t             const count_hi,
                    uint32_t             const wait_list_size,
                    cl_event           *       wait_list)
{
  uint32_t const vout_span = count_hi - count_lo;
  uint32_t const pattern   = UINT32_MAX;

  // append the fill event to the state's wait list
  cl(EnqueueFillBuffer(state->cq,
                       state->vout,
                       &pattern,
                       sizeof(pattern),
                       count_lo  * hs->key_val_size,
                       vout_span * hs->key_val_size,
                       wait_list_size,
                       wait_list,
                       state->wait_list+state->wait_list_size++));

  HS_STATE_WAIT_LIST_PROFILE(state);
}

//
// We want concurrent kernel execution to occur in a few places.
//
// The summary is:
//
//   1) If necessary, some max-valued keys are written to the end of
//      the vin/vout buffers.
//
//   2) Blocks of slabs of keys are sorted.
//
//   3) If necessary, the blocks of slabs are merged until complete.
//
//   4) If requested, the sorted keys are converted from slab ordering
//      to linear ordering.
//
// Below is the general "happens-before" relationship between HotSort
// compute kernels.
//
// Note the diagram assumes vin and vout are different buffers.  If
// they're not, then the first merge doesn't include the pad_vout
// event in the wait list.
//
//       +----------+              +---------+
//       | pad_vout |              | pad_vin |
//       +----+-----+              +----+----+
//            |                         |
//            |                  WAITFOR(pad_vin)
//            |                         |
//            |                   +-----v-----+
//            |                   |           |
//            |              +----v----+ +----v----+
//            |              | bs_full | | bs_frac |
//            |              +----+----+ +----+----+
//            |                   |           |
//            |                   +-----v-----+
//            |                         |
//            |  +-------NO-------JUST ONE BLOCK?
//            | /                       |
//            |/                       YES
//            +                         |
//            |                         v
//            |         END_WITH_EVENTS(bs_full,bs_frac)
//            |
//            |
//    WAITFOR(pad_vout,bs_full,bs_frac)    >>> first iteration of loop <<<
//            |
//            |
//            +------------<-------------+
//            |                          |
//      +-----v-----+                    |
//      |           |                    |
// +----v----+ +----v----+               |
// | fm_full | | fm_frac |               |
// +----+----+ +----+----+               |
//      |           |                    ^
//      +-----v-----+                    |
//            |                          |
//  WAITFOR(fm_full,fm_frac)             |
//            |                          |
//            v                          |
//         +--v--+                  WAITFOR(bc)
//         | hm  |                       |
//         +-----+                       |
//            |                          |
//       WAITFOR(hm)                     |
//            |                          ^
//         +--v--+                       |
//         | bc  |                       |
//         +-----+                       |
//            |                          |
//            v                          |
//    MERGING COMPLETE?-------NO---------+
//            |
//           YES
//            |
//            v
//   END_WITH_EVENTS(bc)
//

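//
// A typical calling sequence looks roughly like the sketch below.
// This is illustrative only -- buffer creation, error handling and
// the generated hs_cl_target (here `hs_target`), context, device and
// command queue are assumed to be provided by the caller.
//
//   struct hs_cl * hs = hs_cl_create(&hs_target,context,device_id);
//
//   uint32_t padded_in, padded_out;
//
//   hs_cl_pad(hs,count,&padded_in,&padded_out);
//
//   // allocate vin/vout large enough for the padded key counts,
//   // fill vin with `count` keys, then:
//   hs_cl_sort(hs,cq,
//              0,NULL,&done,          // wait list and completion event
//              vin,vout,
//              count,padded_in,padded_out,
//              true);                 // linearize the slab ordering
//
//   clWaitForEvents(1,&done);
//   cl(ReleaseEvent(done));
//
//   hs_cl_release(hs);
//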
void
hs_cl_sort(struct hs_cl const * const hs,
           cl_command_queue           cq,
           uint32_t             const wait_list_size,
           cl_event           *       wait_list,
           cl_event           *       event,
           cl_mem                     vin,
           cl_mem                     vout,
           uint32_t             const count,
           uint32_t             const count_padded_in,
           uint32_t             const count_padded_out,
           bool                 const linearize)
{
  // is this sort in place?
  bool const is_in_place = (vout == NULL);

  // cq, buffers, wait list and slab count
  struct hs_state state = {
#ifndef NDEBUG
    .t_total        = 0,
#endif
    .cq             = cq,
    .vin            = vin,
    .vout           = is_in_place ? vin : vout,
    .wait_list_size = 0,
    .bx_ru          = (count + hs->slab_keys - 1) / hs->slab_keys
  };

  // initialize vin
  uint32_t const count_hi                = is_in_place ? count_padded_out : count_padded_in;
  bool     const is_pre_sort_keyset_reqd = count_hi > count;
  cl_event       event_keyset_pre_sort[1];

  // initialize any trailing keys in vin before sorting
  if (is_pre_sort_keyset_reqd)
    {
      hs_keyset_pre_sort(hs,&state,
                         count,count_hi,
                         wait_list_size,wait_list,
                         event_keyset_pre_sort);
    }

  // initialize any trailing keys in vout before merging
  if (!is_in_place && (count_padded_out > count_padded_in))
    {
      hs_keyset_pre_merge(hs,&state,
                          count_padded_in,count_padded_out,
                          wait_list_size,wait_list);
    }

  //
  // sort blocks of slabs
  //
  hs_bs(hs,&state,
        count_padded_in,
        is_pre_sort_keyset_reqd ? 1                     : wait_list_size,
        is_pre_sort_keyset_reqd ? event_keyset_pre_sort : wait_list);

  // release the event
  if (is_pre_sort_keyset_reqd)
    cl(ReleaseEvent(event_keyset_pre_sort[0]));

  //
  // we're done if this was a single bs block...
  //
  // otherwise, merge sorted spans of slabs until done
  //
  if (state.bx_ru > hs->config.block.slabs)
    {
      int32_t up_scale_log2 = 1;

      while (true)
        {
          uint32_t down_slabs;

          // flip merge slabs -- return span of slabs that must be cleaned
          uint32_t clean_slabs_log2 = hs_fm(hs,&state,
                                            &down_slabs,
                                            up_scale_log2);

          // if span is gt largest slab block cleaner then half merge
          while (clean_slabs_log2 > hs->bc_slabs_log2_max)
            {
              clean_slabs_log2 = hs_hm(hs,&state,
                                       down_slabs,
                                       clean_slabs_log2);
            }

          // launch clean slab grid -- is it the final launch?
          hs_bc(hs,&state,down_slabs,clean_slabs_log2);

          // was this the final block clean?
          if (((uint32_t)hs->config.block.slabs << up_scale_log2) >= state.bx_ru)
            break;

          // otherwise, merge twice as many slabs
          up_scale_log2 += 1;
        }
    }

  // slabs or linear?
  if (linearize) {
    hs_transpose(hs,&state);
  }

  // does the caller want the final event?
  if (event != NULL) {
    *event = state.wait_list[0];
  } else {
    cl(ReleaseEvent(state.wait_list[0]));
  }
}

//
// all grids will be computed as a function of the minimum number of slabs
//

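// Computes the padded key counts for a sort of `count` keys:
// count_padded_in rounds the input up to whole slabs (clamped to a
// block), and count_padded_out may be larger when merging will be
// required.  Callers size their vin/vout allocations from these
// values.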
void
hs_cl_pad(struct hs_cl const * const hs,
          uint32_t             const count,
          uint32_t           * const count_padded_in,
          uint32_t           * const count_padded_out)
{
  //
  // round up the count to slabs
  //
  uint32_t const slabs_ru        = (count + hs->slab_keys - 1) / hs->slab_keys;
  uint32_t const blocks          = slabs_ru / hs->config.block.slabs;
  uint32_t const block_slabs     = blocks * hs->config.block.slabs;
  uint32_t const slabs_ru_rem    = slabs_ru - block_slabs;
  uint32_t const slabs_ru_rem_ru = MIN_MACRO(pow2_ru_u32(slabs_ru_rem),hs->config.block.slabs);

  *count_padded_in  = (block_slabs + slabs_ru_rem_ru) * hs->slab_keys;
  *count_padded_out = *count_padded_in;

  //
  // will merging be required?
  //
  if (slabs_ru > hs->config.block.slabs)
    {
      // more than one block
      uint32_t const blocks_lo       = pow2_rd_u32(blocks);
      uint32_t const block_slabs_lo  = blocks_lo * hs->config.block.slabs;
      uint32_t const block_slabs_rem = slabs_ru - block_slabs_lo;

      if (block_slabs_rem > 0)
        {
          uint32_t const block_slabs_rem_ru = pow2_ru_u32(block_slabs_rem);

          uint32_t const block_slabs_hi = MAX_MACRO(block_slabs_rem_ru,
                                                    blocks_lo << (1 - hs->config.merge.fm.scale_min));

          uint32_t const block_slabs_padded_out = MIN_MACRO(block_slabs_lo+block_slabs_hi,
                                                            block_slabs_lo*2); // clamp non-pow2 blocks

          *count_padded_out = block_slabs_padded_out * hs->slab_keys;
        }
    }
}

//
//
//

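// Kernel creation helpers: hs_create_kernel() creates a single named
// kernel and hs_create_kernels() creates a numbered family by patching
// the trailing digit of the name template (e.g. "hs_kernel_bs_0",
// "hs_kernel_bs_1", ...).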
static
void
hs_create_kernel(cl_program         program,
                 cl_kernel  * const kernel,
                 char const * const name)
{
  cl_int err;

  *kernel = clCreateKernel(program,name,&err);

  cl_ok(err);
}

static
void
hs_create_kernels(cl_program     program,
                  cl_kernel    * kernels,
                  char           name_template[],
                  size_t   const name_template_size,
                  uint32_t const count)
{
  char const n_max = '0'+(char)count;

  for (char n = '0'; n<n_max; n++)
    {
      cl_int err;

      name_template[name_template_size-2] = n;

      *kernels++ = clCreateKernel(program,name_template,&err);

      cl_ok(err);
    }
}

//
//
//

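// Builds the target's OpenCL program (from either a binary or source,
// as flagged by the first byte of the target program blob), derives
// the per-target constants, and creates every kernel into the
// trailing `all[]` array.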
struct hs_cl *
hs_cl_create(struct hs_cl_target const * const target,
             cl_context                        context,
             cl_device_id                      device_id)
{
  //
  // immediately try to build the OpenCL program
  //
  bool     const is_binary    = (target->program[0] == 0);
  uint32_t const program_size = NPBTOHL_MACRO(target->program+1);

  cl_program program;

  if (is_binary) // program is a binary
    {
      cl_int status, err;

      size_t        const   bins_sizeof[] = { program_size };
      unsigned char const * bins[]        = { target->program+5 };

      program = clCreateProgramWithBinary(context,
                                          1,
                                          &device_id,
                                          bins_sizeof,
                                          bins,
                                          &status,
                                          &err);
      cl_ok(err);

      fprintf(stdout,"Building binary... ");

      fflush(stdout);

      cl(BuildProgram(program,
                      1,
                      &device_id,
                      NULL,
                      NULL,
                      NULL));
    }
  else // program is source code
    {
      cl_int err;

      size_t const strings_sizeof[] = { program_size };
      char const * strings[]        = { (char*)target->program+5 };

      program = clCreateProgramWithSource(context,
                                          1,
                                          strings,
                                          strings_sizeof,
                                          &err);
      cl_ok(err);

      char const * const options =
        "-cl-std=CL1.2 -cl-fast-relaxed-math "
        "-cl-no-signed-zeros -cl-mad-enable "
        "-cl-denorms-are-zero "
        "-cl-kernel-arg-info";

      fprintf(stdout,"Building source... ");

      fflush(stdout);

      cl(BuildProgram(program,
                      1,
                      &device_id,
                      options,
                      NULL,
                      NULL));
    }

  //
  // we reference these values a lot
  //
  uint32_t const bs_slabs_log2_ru  = msb_idx_u32(pow2_ru_u32(target->config.block.slabs));
  uint32_t const bc_slabs_log2_max = msb_idx_u32(pow2_rd_u32(target->config.block.slabs));

  //
  // how many kernels will be created?
  //
  uint32_t const count_bs    = bs_slabs_log2_ru + 1;
  uint32_t const count_bc    = bc_slabs_log2_max + 1;
  uint32_t       count_fm[3] = { 0 };
  uint32_t       count_hm[3] = { 0 };

  // guaranteed to be in range [0,2]
  for (uint32_t scale = target->config.merge.fm.scale_min;
       scale <= target->config.merge.fm.scale_max;
       scale++)
    {
      uint32_t fm_left = (target->config.block.slabs / 2) << scale;

      count_fm[scale] = msb_idx_u32(pow2_ru_u32(fm_left)) + 1;
    }

  // guaranteed to be in range [0,2]
  for (uint32_t scale = target->config.merge.hm.scale_min;
       scale <= target->config.merge.hm.scale_max;
       scale++)
    {
      count_hm[scale] = 1;
    }

  uint32_t const count_all =
    1
    + count_bs
    + count_bc
    + count_fm[0] + count_fm[1] + count_fm[2]
    + count_hm[0] + count_hm[1] + count_hm[2];

  //
  // allocate hs_cl
  //
  struct hs_cl * hs = malloc(sizeof(*hs) + sizeof(cl_kernel) * count_all);

  memcpy(&hs->config,&target->config,sizeof(hs->config));

  // save some frequently used calculated values
  hs->key_val_size      = (target->config.words.key + target->config.words.val) * 4;
  hs->slab_keys         = target->config.slab.height << target->config.slab.width_log2;
  hs->bs_slabs_log2_ru  = bs_slabs_log2_ru;
  hs->bc_slabs_log2_max = bc_slabs_log2_max;

  // save kernel count
  hs->kernels.count = count_all;

  //
  // create all the kernels and release the program
  //
  cl_kernel * kernel_next = hs->kernels.all;

  //
  // BS
  //
  {
    hs->kernels.bs = kernel_next;

    char bs_name[] = { "hs_kernel_bs_X" };

    hs_create_kernels(program,
                      kernel_next,
                      bs_name,sizeof(bs_name),
                      count_bs);

    kernel_next += count_bs;
  }

  //
  // BC
  //
  {
    hs->kernels.bc = kernel_next;

    char bc_name[] = { "hs_kernel_bc_X" };

    hs_create_kernels(program,
                      kernel_next,
                      bc_name,sizeof(bc_name),
                      count_bc);

    kernel_next += count_bc;
  }

  //
  // FM
  //
  if (count_fm[0] > 0)
    {
      hs->kernels.fm[0] = kernel_next;

      char fm_0_name[] = { "hs_kernel_fm_0_X" };

      hs_create_kernels(program,
                        kernel_next,
                        fm_0_name,sizeof(fm_0_name),
                        count_fm[0]);

      kernel_next += count_fm[0];
    }

  if (count_fm[1] > 0)
    {
      hs->kernels.fm[1] = kernel_next;

      char fm_1_name[] = { "hs_kernel_fm_1_X" };

      hs_create_kernels(program,
                        kernel_next,
                        fm_1_name,sizeof(fm_1_name),
                        count_fm[1]);

      kernel_next += count_fm[1];
    }

  if (count_fm[2] > 0)
    {
      hs->kernels.fm[2] = kernel_next;

      char fm_2_name[] = { "hs_kernel_fm_2_X" };

      hs_create_kernels(program,
                        kernel_next,
                        fm_2_name,sizeof(fm_2_name),
                        count_fm[2]);

      kernel_next += count_fm[2];
    }

  //
  // HM
  //
  if (count_hm[0] > 0)
    {
      hs->kernels.hm[0] = kernel_next;

      hs_create_kernel(program,
                       kernel_next,
                       "hs_kernel_hm_0");

      kernel_next += count_hm[0];
    }

  if (count_hm[1] > 0)
    {
      hs->kernels.hm[1] = kernel_next;

      hs_create_kernel(program,
                       kernel_next,
                       "hs_kernel_hm_1");

      kernel_next += count_hm[1];
    }

  if (count_hm[2] > 0)
    {
      hs->kernels.hm[2] = kernel_next;

      hs_create_kernel(program,
                       kernel_next,
                       "hs_kernel_hm_2");

      kernel_next += count_hm[2]; // unnecessary
    }

  //
  // TRANSPOSE
  //
  {
    hs->kernels.transpose = kernel_next;

    hs_create_kernel(program,
                     kernel_next,
                     "hs_kernel_transpose");

    kernel_next += 1;
  }

  // the kernels retain the program so it can be released now
  cl(ReleaseProgram(program));

  return hs;
}

//
//
//

void
hs_cl_release(struct hs_cl * const hs)
{
  for (uint32_t ii=0; ii<hs->kernels.count; ii++)
    cl(ReleaseKernel(hs->kernels.all[ii]));

  free(hs);
}

//
//
//
