//
// Copyright 2016 Google Inc.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//

#ifndef HS_CUDA_MACROS_ONCE
#define HS_CUDA_MACROS_ONCE

//
//
//

#ifdef __cplusplus
extern "C" {
#endif

#include <stdint.h>

#ifdef __cplusplus
}
#endif

//
// Define the key type based on the key and val word counts
//

#if   HS_KEY_WORDS == 1
#if   HS_VAL_WORDS == 0
#define HS_KEY_TYPE  uint32_t
#endif
#elif HS_KEY_WORDS == 2
#define HS_KEY_TYPE  uint64_t
#endif

//
// FYI, restrict shouldn't have any impact on these kernels, and
// benchmarks appear to bear that out
//

#define HS_RESTRICT  __restrict__

//
//
//

#define HS_SCOPE()                              \
  static

#define HS_KERNEL_QUALIFIER()                   \
  __global__ void

//
// The sm_35 arch has a maximum of 16 blocks per multiprocessor, so
// clamp the requested minimum to 16 when targeting this arch.
//
// This only arises when compiling the 32-bit sorting kernels.
//
// You can also generate a narrower 16-warp-wide 32-bit sorting kernel
// which is sometimes faster and sometimes slower than the 32-block
// configuration.
//

#if ( __CUDA_ARCH__ == 350 )
#define HS_CUDA_MAX_BPM  16
#else
#define HS_CUDA_MAX_BPM  UINT32_MAX // effectively unclamped (newer archs allow 32)
#endif

#define HS_CLAMPED_BPM(min_bpm)                                 \
  ((min_bpm) < HS_CUDA_MAX_BPM ? (min_bpm) : HS_CUDA_MAX_BPM)

//
//
//

#define HS_LAUNCH_BOUNDS(max_tpb,min_bpm)               \
  __launch_bounds__(max_tpb,HS_CLAMPED_BPM(min_bpm))

//
// KERNEL PROTOS
//

#define HS_BS_KERNEL_NAME(slab_count_ru_log2)   \
  hs_kernel_bs_##slab_count_ru_log2

#define HS_BS_KERNEL_PROTO(slab_count,slab_count_ru_log2)                            \
  HS_SCOPE()                                                                         \
  HS_KERNEL_QUALIFIER()                                                              \
  HS_LAUNCH_BOUNDS(HS_SLAB_THREADS*slab_count,1)                                     \
  HS_BS_KERNEL_NAME(slab_count_ru_log2)(HS_KEY_TYPE       * const HS_RESTRICT vout, \
                                        HS_KEY_TYPE const * const HS_RESTRICT vin)

//

#define HS_OFFSET_BS_KERNEL_NAME(slab_count_ru_log2)    \
  hs_kernel_offset_bs_##slab_count_ru_log2

#define HS_OFFSET_BS_KERNEL_PROTO(slab_count,slab_count_ru_log2)                            \
  HS_SCOPE()                                                                                \
  HS_KERNEL_QUALIFIER()                                                                     \
  HS_LAUNCH_BOUNDS(HS_SLAB_THREADS*slab_count,HS_BS_SLABS/(1<<slab_count_ru_log2))          \
  HS_OFFSET_BS_KERNEL_NAME(slab_count_ru_log2)(HS_KEY_TYPE       * const HS_RESTRICT vout, \
                                               HS_KEY_TYPE const * const HS_RESTRICT vin,  \
                                               uint32_t    const                     slab_offset)

//
// Grid and lane helpers used by the preambles and transpose below
//

#define HS_GLOBAL_SIZE_X()  (gridDim.x * blockDim.x)
#define HS_GLOBAL_ID_X()    (blockIdx.x * blockDim.x + threadIdx.x)
#define HS_LANE_ID()        (threadIdx.x & 31)

//
// COMPARE-EXCHANGE
//

#define HS_CMP_XCHG_V0(a,b)                     \
  if (a >= b) {                                 \
    HS_KEY_TYPE const t = a;                    \
    a = b;                                      \
    b = t;                                      \
  } // good

#define HS_CMP_XCHG_V3(a,b)                     \
  {                                             \
    int32_t const ge = a >= b;                  \
    HS_KEY_TYPE const t = a;                    \
    a = ge ? b : a;                             \
    b = ge ? t : b;                             \
  }

//
//
//

#if   (HS_KEY_WORDS == 1)
#define HS_CMP_XCHG(a,b)  HS_CMP_XCHG_V0(a,b)
#elif (HS_KEY_WORDS == 2)
#define HS_CMP_XCHG(a,b)  HS_CMP_XCHG_V0(a,b)
#endif
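//
// Usage sketch (illustrative only, so it is compiled out): chaining
// HS_CMP_XCHG forms a small sorting network. The function below is
// hypothetical and is not one of the generated kernels.
//

#if 0
static __device__ __forceinline__
void
hs_example_sort3(HS_KEY_TYPE & r0, HS_KEY_TYPE & r1, HS_KEY_TYPE & r2)
{
  // 3-key sorting network: r0 <= r1 <= r2 afterward
  HS_CMP_XCHG(r0,r1)
  HS_CMP_XCHG(r1,r2)
  HS_CMP_XCHG(r0,r1)
}
#endif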
//
// The flip/half comparisons rely on a "conditional min/max":
//
//   - if the flag is false, return max(a,b)
//   - otherwise, return min(a,b)
//
// What's a little surprising is that sequence (1) (V1 below) is faster
// than (2) (V0) for 32-bit keys.
//
// I suspect either a code generation problem or that the sequence
// maps well to the GEN instruction set.
//
// We mostly care about 64-bit keys and, unsurprisingly, sequence (2)
// is fastest for this wider type.
//

// (2) this is what you would normally use
#define HS_COND_MIN_MAX_V0(lt,a,b)  ((a <= b) ^ lt) ? b : a

// (1) this seems to be faster for 32-bit keys
#define HS_COND_MIN_MAX_V1(lt,a,b)  (lt ? b : a) ^ ((a ^ b) & HS_LTE_TO_MASK(a,b))

//
//
//

#if   (HS_KEY_WORDS == 1)
#define HS_COND_MIN_MAX(lt,a,b)  HS_COND_MIN_MAX_V0(lt,a,b)
#elif (HS_KEY_WORDS == 2)
#define HS_COND_MIN_MAX(lt,a,b)  HS_COND_MIN_MAX_V0(lt,a,b)
#endif

//
// HotSort shuffles are always warp-wide
//

#define HS_SHFL_ALL  0xFFFFFFFF

//
// Conditional inter-subgroup flip/half compare exchange
//
// Both macros expect flip_lane_idx/half_lane_idx and t_lt to already
// be defined at the expansion site
//

#define HS_CMP_FLIP(i,a,b)                                              \
  {                                                                     \
    HS_KEY_TYPE const ta = __shfl_sync(HS_SHFL_ALL,a,flip_lane_idx);    \
    HS_KEY_TYPE const tb = __shfl_sync(HS_SHFL_ALL,b,flip_lane_idx);    \
    a = HS_COND_MIN_MAX(t_lt,a,tb);                                     \
    b = HS_COND_MIN_MAX(t_lt,b,ta);                                     \
  }

#define HS_CMP_HALF(i,a)                                                \
  {                                                                     \
    HS_KEY_TYPE const ta = __shfl_sync(HS_SHFL_ALL,a,half_lane_idx);    \
    a = HS_COND_MIN_MAX(t_lt,a,ta);                                     \
  }

//
// The device's comparison operator might return what we actually
// want. For example, it appears GEN 'cmp' returns {true:-1,false:0}.
//

#define HS_CMP_IS_ZERO_ONE

#ifdef HS_CMP_IS_ZERO_ONE

// CUDA (like scalar OpenCL) returns a {true: +1, false: 0} result
// (a <= b) -> { +1, 0 } -> NEGATE -> { 0xFFFFFFFF, 0 }
#define HS_LTE_TO_MASK(a,b)  (HS_KEY_TYPE)(-(a <= b))
#define HS_CMP_TO_MASK(a)    (HS_KEY_TYPE)(-a)

#else

// However, OpenCL requires { -1, 0 } for vectors
// (a <= b) -> { 0xFFFFFFFF, 0 }
#define HS_LTE_TO_MASK(a,b)  (a <= b) // FIXME for uint64
#define HS_CMP_TO_MASK(a)    (a)

#endif

//
// The "flip-merge" and "half-merge" preambles are very similar
//
// For now, we're only using the .y dimension for the span idx
//
// HS_OFFSET_FM_PREAMBLE expects span_offset to arrive as a kernel
// parameter
//

#define HS_OFFSET_HM_PREAMBLE(half_span,span_offset)            \
  uint32_t const span_idx    = span_offset + blockIdx.y;        \
  uint32_t const span_stride = HS_GLOBAL_SIZE_X();              \
  uint32_t const span_size   = span_stride * half_span * 2;     \
  uint32_t const span_base   = span_idx * span_size;            \
  uint32_t const span_off    = HS_GLOBAL_ID_X();                \
  uint32_t const span_l      = span_base + span_off

#define HS_HM_PREAMBLE(half_span)                               \
  HS_OFFSET_HM_PREAMBLE(half_span,0)

#define HS_FM_PREAMBLE(half_span)                               \
  HS_HM_PREAMBLE(half_span);                                    \
  uint32_t const span_r = span_base + span_stride * (half_span + 1) - span_off - 1

#define HS_OFFSET_FM_PREAMBLE(half_span)                        \
  HS_OFFSET_HM_PREAMBLE(half_span,span_offset);                 \
  uint32_t const span_r = span_base + span_stride * (half_span + 1) - span_off - 1

//
//
//

#define HS_XM_GLOBAL_L(stride_idx)              \
  vout[span_l + span_stride * stride_idx]

#define HS_XM_GLOBAL_LOAD_L(stride_idx)         \
  HS_XM_GLOBAL_L(stride_idx)

#define HS_XM_GLOBAL_STORE_L(stride_idx,reg)    \
  HS_XM_GLOBAL_L(stride_idx) = reg

#define HS_FM_GLOBAL_R(stride_idx)              \
  vout[span_r + span_stride * stride_idx]

#define HS_FM_GLOBAL_LOAD_R(stride_idx)         \
  HS_FM_GLOBAL_R(stride_idx)

#define HS_FM_GLOBAL_STORE_R(stride_idx,reg)    \
  HS_FM_GLOBAL_R(stride_idx) = reg
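//
// Worked example (assumed launch configuration, for illustration):
// with half_span = 8, HS_GLOBAL_SIZE_X() = 256, span_offset = 0 and
// blockIdx.y = 1, the HM preamble computes:
//
//   span_idx    = 1
//   span_stride = 256
//   span_size   = 256 * 8 * 2 = 4096
//   span_base   = 1 * 4096   = 4096
//
// so HS_XM_GLOBAL_L(stride_idx) for the lane with HS_GLOBAL_ID_X() = g
// resolves to vout[4096 + g + 256 * stride_idx], i.e. each stride_idx
// step advances one 256-key row within the span.
//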
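//
// Usage sketch (illustrative only, so it is compiled out): a warp-wide
// "flip" compare exchange. HS_CMP_FLIP expects flip_lane_idx and t_lt
// to already be in scope; the definitions below are assumptions for
// this sketch, not the generator's.
//

#if 0
static __device__ __forceinline__
void
hs_example_warp_flip(HS_KEY_TYPE & r0, HS_KEY_TYPE & r1)
{
  uint32_t const lane_idx      = threadIdx.x & 31;
  uint32_t const flip_lane_idx = lane_idx ^ 31;            // partner lane: reverse the warp
  bool     const t_lt          = lane_idx < flip_lane_idx; // low half keeps the lesser key

  HS_CMP_FLIP(0,r0,r1)
}
#endif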
//
// This snarl of macros is for transposing a "slab" of sorted elements
// into linear order.
//
// This can occur as the last step in hs_sort() or via a custom kernel
// that inspects the slab and then transposes and stores it to memory.
//
// The slab format can be inspected more efficiently than a linear
// arrangement.
//
// The prime example is detecting when adjacent keys (in sort order)
// have differing high order bits ("key changes"). The index of each
// change is recorded to an auxiliary array.
//
// A post-processing step like this needs to be able to navigate the
// slab and eventually transpose and store the slab in linear order.
//

#define HS_SUBGROUP_SHUFFLE_XOR(v,m)  __shfl_xor_sync(HS_SHFL_ALL,v,m)

#define HS_TRANSPOSE_REG(prefix,row)  prefix##row

#define HS_TRANSPOSE_DECL(prefix,row)                   \
  HS_KEY_TYPE const HS_TRANSPOSE_REG(prefix,row)

#define HS_TRANSPOSE_PRED(level)  is_lo_##level

#define HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur) \
  prefix_curr##row_ll##_##row_ur

#define HS_TRANSPOSE_TMP_DECL(prefix_curr,row_ll,row_ur)        \
  HS_KEY_TYPE const HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur)

#define HS_TRANSPOSE_STAGE(level)               \
  bool const HS_TRANSPOSE_PRED(level) =         \
    (HS_LANE_ID() & (1 << (level-1))) == 0;

#define HS_TRANSPOSE_BLEND(prefix_prev,prefix_curr,level,row_ll,row_ur) \
  HS_TRANSPOSE_TMP_DECL(prefix_curr,row_ll,row_ur) =                    \
    HS_SUBGROUP_SHUFFLE_XOR(HS_TRANSPOSE_PRED(level)             ?      \
                            HS_TRANSPOSE_REG(prefix_prev,row_ll) :      \
                            HS_TRANSPOSE_REG(prefix_prev,row_ur),       \
                            1<<(level-1));                              \
                                                                        \
  HS_TRANSPOSE_DECL(prefix_curr,row_ll) =                               \
    HS_TRANSPOSE_PRED(level)                        ?                   \
    HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur) :                   \
    HS_TRANSPOSE_REG(prefix_prev,row_ll);                               \
                                                                        \
  HS_TRANSPOSE_DECL(prefix_curr,row_ur) =                               \
    HS_TRANSPOSE_PRED(level)             ?                              \
    HS_TRANSPOSE_REG(prefix_prev,row_ur) :                              \
    HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur);

#define HS_TRANSPOSE_REMAP(prefix,row_from,row_to)              \
  vout[gmem_idx + ((row_to-1) << HS_SLAB_WIDTH_LOG2)] =         \
    HS_TRANSPOSE_REG(prefix,row_from);

//
//
//

#endif

//
//
//
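//
// Usage sketch (illustrative only, so it is compiled out): transposing
// a 2-row slab into linear order. The register prefixes and the
// surrounding kernel state (vin, vout, gmem_idx) are assumptions for
// this sketch; the real transpose kernels are emitted by the HotSort
// generator.
//

#if 0
{
  // each lane holds two rows of a sorted slab in registers r1,r2
  HS_KEY_TYPE const r1 = vin[gmem_idx];
  HS_KEY_TYPE const r2 = vin[gmem_idx + (1u << HS_SLAB_WIDTH_LOG2)];

  HS_TRANSPOSE_STAGE(1)          // declares is_lo_1 from the lane id
  HS_TRANSPOSE_BLEND(r,s,1,1,2)  // exchange r1/r2 across xor-1 lane pairs into s1,s2
  HS_TRANSPOSE_REMAP(s,1,1)      // store both rows back in linear order
  HS_TRANSPOSE_REMAP(s,2,2)
}
#endif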