Lines Matching refs:hs
233 hs_transpose(struct hs_cl const * const hs, in hs_transpose() argument
236 size_t const size[1] = { state->bx_ru << hs->config.slab.threads_log2 }; in hs_transpose()
237 cl_kernel kernel = hs->kernels.transpose[0]; in hs_transpose()
273 hs_hm_enqueue(struct hs_cl const * const hs, in hs_hm_enqueue() argument
283 cl_kernel kernel = hs->kernels.hm[scale_log2][0]; in hs_hm_enqueue()
313 hs_hm(struct hs_cl const * const hs, in hs_hm() argument
323 uint32_t const log2_rem = clean_slabs_log2 - hs->bc_slabs_log2_max; in hs_hm()
324 uint32_t const scale_log2 = MIN_MACRO(hs->config.merge.hm.scale_max,log2_rem); in hs_hm()
328 uint32_t const span_threads = hs->slab_keys << log2_out; in hs_hm()
331 hs_hm_enqueue(hs, in hs_hm()
346 hs_bc_enqueue(struct hs_cl const * const hs, in hs_bc_enqueue() argument
351 size_t const size[1] = { full << hs->config.slab.threads_log2 }; in hs_bc_enqueue()
352 cl_kernel kernel = hs->kernels.bc[clean_slabs_log2]; in hs_bc_enqueue()
382 hs_bc(struct hs_cl const * const hs, in hs_bc() argument
392 hs_bc_enqueue(hs,state,full,clean_slabs_log2); in hs_bc()
401 hs_fm_enqueue(struct hs_cl const * const hs, in hs_fm_enqueue() argument
417 cl_kernel kernel_full = hs->kernels.fm[scale_log2][hs->bs_slabs_log2_ru-1+scale_log2]; in hs_fm_enqueue()
438 cl_kernel kernel_frac = hs->kernels.fm[scale_log2][msb_idx_u32(fm_frac)]; in hs_fm_enqueue()
467 hs_fm(struct hs_cl const * const hs, in hs_fm() argument
483 uint32_t const scale_log2 = MIN_MACRO(hs->config.merge.fm.scale_max,up_scale_log2); in hs_fm()
487 uint32_t const full_span_slabs = hs->config.block.slabs << up_scale_log2; in hs_fm()
522 uint32_t const span_threads = hs->slab_keys << clean_log2; in hs_fm()
527 hs_fm_enqueue(hs, in hs_fm()
543 hs_bs_enqueue(struct hs_cl const * const hs, in hs_bs_enqueue() argument
555 size_t const size_full[1] = { full << hs->config.slab.threads_log2 }; in hs_bs_enqueue()
556 cl_kernel kernel_full = hs->kernels.bs[hs->bs_slabs_log2_ru]; in hs_bs_enqueue()
576 size_t const offset_frac[1] = { full << hs->config.slab.threads_log2 }; in hs_bs_enqueue()
577 size_t const size_frac [1] = { frac << hs->config.slab.threads_log2 }; in hs_bs_enqueue()
578 cl_kernel kernel_frac = hs->kernels.bs[msb_idx_u32(frac)]; in hs_bs_enqueue()
608 hs_bs(struct hs_cl const * const hs, in hs_bs() argument
614 uint32_t const slabs_in = count_padded_in / hs->slab_keys; in hs_bs()
615 uint32_t const full = (slabs_in / hs->config.block.slabs) * hs->config.block.slabs; in hs_bs()
618 hs_bs_enqueue(hs,state, in hs_bs()
629 hs_keyset_pre_sort(struct hs_cl const * const hs, in hs_keyset_pre_sort() argument
644 count * hs->key_val_size, in hs_keyset_pre_sort()
645 vin_span * hs->key_val_size, in hs_keyset_pre_sort()
659 hs_keyset_pre_merge(struct hs_cl const * const hs, in hs_keyset_pre_merge() argument
674 count_lo * hs->key_val_size, in hs_keyset_pre_merge()
675 vout_span * hs->key_val_size, in hs_keyset_pre_merge()
763 hs_cl_sort(struct hs_cl const * const hs, in hs_cl_sort() argument
787 .bx_ru = (count + hs->slab_keys - 1) / hs->slab_keys in hs_cl_sort()
798 hs_keyset_pre_sort(hs,&state, in hs_cl_sort()
807 hs_keyset_pre_merge(hs,&state, in hs_cl_sort()
815 hs_bs(hs,&state, in hs_cl_sort()
829 if (state.bx_ru > hs->config.block.slabs) in hs_cl_sort()
838 uint32_t clean_slabs_log2 = hs_fm(hs,&state, in hs_cl_sort()
843 while (clean_slabs_log2 > hs->bc_slabs_log2_max) in hs_cl_sort()
845 clean_slabs_log2 = hs_hm(hs,&state, in hs_cl_sort()
851 hs_bc(hs,&state,down_slabs,clean_slabs_log2); in hs_cl_sort()
854 if (((uint32_t)hs->config.block.slabs << up_scale_log2) >= state.bx_ru) in hs_cl_sort()
864 hs_transpose(hs,&state); in hs_cl_sort()
880 hs_cl_pad(struct hs_cl const * const hs, in hs_cl_pad() argument
888 uint32_t const slabs_ru = (count + hs->slab_keys - 1) / hs->slab_keys; in hs_cl_pad()
889 uint32_t const blocks = slabs_ru / hs->config.block.slabs; in hs_cl_pad()
890 uint32_t const block_slabs = blocks * hs->config.block.slabs; in hs_cl_pad()
892 uint32_t const slabs_ru_rem_ru = MIN_MACRO(pow2_ru_u32(slabs_ru_rem),hs->config.block.slabs); in hs_cl_pad()
894 *count_padded_in = (block_slabs + slabs_ru_rem_ru) * hs->slab_keys; in hs_cl_pad()
900 if (slabs_ru > hs->config.block.slabs) in hs_cl_pad()
904 uint32_t const block_slabs_lo = blocks_lo * hs->config.block.slabs; in hs_cl_pad()
912 … blocks_lo << (1 - hs->config.merge.fm.scale_min)); in hs_cl_pad()
917 *count_padded_out = block_slabs_padded_out * hs->slab_keys; in hs_cl_pad()
1079 struct hs_cl * hs = malloc(sizeof(*hs) + sizeof(cl_kernel) * count_all); in hs_cl_create() local
1081 memcpy(&hs->config,&target->config,sizeof(hs->config)); in hs_cl_create()
1084 hs->key_val_size = (target->config.words.key + target->config.words.val) * 4; in hs_cl_create()
1085 hs->slab_keys = target->config.slab.height << target->config.slab.width_log2; in hs_cl_create()
1086 hs->bs_slabs_log2_ru = bs_slabs_log2_ru; in hs_cl_create()
1087 hs->bc_slabs_log2_max = bc_slabs_log2_max; in hs_cl_create()
1090 hs->kernels.count = count_all; in hs_cl_create()
1095 cl_kernel * kernel_next = hs->kernels.all; in hs_cl_create()
1101 hs->kernels.bs = kernel_next; in hs_cl_create()
1117 hs->kernels.bc = kernel_next; in hs_cl_create()
1134 hs->kernels.fm[0] = kernel_next; in hs_cl_create()
1148 hs->kernels.fm[1] = kernel_next; in hs_cl_create()
1162 hs->kernels.fm[2] = kernel_next; in hs_cl_create()
1179 hs->kernels.hm[0] = kernel_next; in hs_cl_create()
1190 hs->kernels.hm[1] = kernel_next; in hs_cl_create()
1201 hs->kernels.hm[2] = kernel_next; in hs_cl_create()
1214 hs->kernels.transpose = kernel_next; in hs_cl_create()
1223 return hs; in hs_cl_create()
1231 hs_cl_release(struct hs_cl * const hs) in hs_cl_release() argument
1233 for (uint32_t ii=0; ii<hs->kernels.count; ii++) in hs_cl_release()
1234 cl(ReleaseKernel(hs->kernels.all[ii])); in hs_cl_release()
1236 free(hs); in hs_cl_release()