/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

#pragma once

//
// TODO:
//
//   Add Key-Val sorting support -- easy.
//

#include <stdio.h>
#include <stdint.h>

//
// All code generation is driven by the specified architectural
// details and host platform API.
//
// In general, the warps-per-block and keys-per-thread are the
// critical knobs for tuning performance.
//
// The configuration is grouped into merge, block, warp, thread and
// key-type parameters.  The meaning of the individual fields is not
// visible from this header -- confirm against the generator before
// relying on any interpretation of them.
//

struct hsg_config
{
  struct {

    struct {
      uint32_t warps;
      uint32_t lo;
      uint32_t hi;
    } flip;

    struct {
      uint32_t warps;
      uint32_t lo;
      uint32_t hi;
    } half;

    uint32_t   max_log2;

  } merge;

  struct {
    uint32_t   warps_min;
    uint32_t   warps_max;
    uint32_t   warps_mod;

    uint32_t   smem_min;
    uint32_t   smem_quantum;

    uint32_t   smem_bs;
    uint32_t   smem_bc;
  } block;

  struct {
    uint32_t   lanes;
    uint32_t   lanes_log2; // presumably log2(lanes) -- TODO confirm
    uint32_t   skpw_bs;
  } warp;

  struct {
    uint32_t   regs;
    uint32_t   xtra;
  } thread;

  struct {
    uint32_t   words;
  } type;
};

//
// HotSort can merge non-power-of-two blocks of warps
//
// Every per-level array below has exactly two entries.
//

struct hsg_level
{
  uint32_t    count; // networks >= 2

  uint32_t    diffs        [2];
  uint32_t    diff_masks   [2];
  uint32_t    evenodds     [2];
  uint32_t    evenodd_masks[2];
  uint32_t    networks     [2];

  // The same 64 bits, viewable as one 64-bit word or two 32-bit words.
  union {
    uint64_t  b64;
    uint32_t  b32a2[2];
  } active;
};

//
// Capacity limits for struct hsg_merge:
//
//   MERGE_LEVELS_MAX_LOG2 sizes the levels[] array.
//   MERGE_LEVELS_MAX_SIZE sizes the offsets[] and networks[] arrays.
//

#define MERGE_LEVELS_MAX_LOG2  7                            // merge up to 128 warps
#define MERGE_LEVELS_MAX_SIZE  (1 << MERGE_LEVELS_MAX_LOG2)

//
// This is computed
//

struct hsg_merge
{
  uint32_t         offsets [MERGE_LEVELS_MAX_SIZE];
  uint32_t         networks[MERGE_LEVELS_MAX_SIZE];

  struct hsg_level levels[MERGE_LEVELS_MAX_LOG2];

  uint32_t         index;

  uint32_t         warps;

  uint32_t         rows_bs;
  uint32_t         rows_bc;

  uint32_t         skpw_bc;
};

//
// Disabled: file bookkeeping types kept for reference only.
//

#if 0

#define HSG_FILE_NAME_SIZE  80

struct hsg_file
{
  FILE       * file;
  char const * prefix;
  char         name[HSG_FILE_NAME_SIZE];
};

//
//
//

typedef enum hsg_file_type {

  HSG_FILE_TYPE_HEADER,
  HSG_FILE_TYPE_SOURCE,

  HSG_FILE_TYPE_COUNT

} hsg_file_type;

#endif

//
// X-macro list of every op type.
//
// HSG_OP_EXPAND_ALL() invokes HSG_OP_EXPAND_X(t) once per op, in the
// order listed.  HSG_OP_TYPE_COUNT must remain the final entry so
// that, when expanded as enumerators, it equals the number of ops
// that precede it.
//

#define HSG_OP_EXPAND_ALL()                                     \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_EXIT)                             \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_END)                              \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BEGIN)                            \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_ELSE)                             \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_TARGET_BEGIN)                     \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_TARGET_END)                       \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO)           \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE)        \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY)            \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_KERNEL_PROTO)                  \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_KERNEL_PREAMBLE)               \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BC_KERNEL_PROTO)                  \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BC_KERNEL_PREAMBLE)               \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_KERNEL_PROTO)                  \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_KERNEL_PREAMBLE)               \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_KERNEL_PROTO)                  \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_KERNEL_PREAMBLE)               \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BX_REG_GLOBAL_LOAD)               \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BX_REG_GLOBAL_STORE)              \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT)          \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT)         \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT)         \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT)        \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_MERGE_RIGHT_PRED)              \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_REG_GLOBAL_LOAD)               \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_REG_GLOBAL_STORE)              \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_SLAB_FLIP)                        \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_SLAB_HALF)                        \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_CMP_FLIP)                         \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_CMP_HALF)                         \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_CMP_XCHG)                         \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_REG_SHARED_STORE_V)            \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_REG_SHARED_LOAD_V)             \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BC_REG_SHARED_LOAD_V)             \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT)         \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT)        \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT)          \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT)         \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT)          \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BLOCK_SYNC)                       \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_FRAC_PRED)                     \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_MERGE_H_PREAMBLE)              \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BC_MERGE_H_PREAMBLE)              \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BX_MERGE_H_PRED)                  \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_ACTIVE_PRED)                   \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_COUNT)

//
// Expand the list as enumerators: each entry becomes "NAME ,".
//
// HSG_OP_EXPAND_X is left defined in this form after the enum.
//

#undef  HSG_OP_EXPAND_X
#define HSG_OP_EXPAND_X(t)  t ,

typedef enum hsg_op_type {

  HSG_OP_EXPAND_ALL()

} hsg_op_type;

//
// A single op: its type plus up to three signed 32-bit operands.
//
// The anonymous union overlays three naming schemes on the same
// storage: (a,b,c), (n,v) and (m,w).  The first members (a, n, m)
// alias each other, as do the second members (b, v, w).
//

struct hsg_op
{
  hsg_op_type type;

  union {

    struct {
      int32_t a;
      int32_t b;
      int32_t c;
    };

    struct {
      int32_t n;
      int32_t v;
    };

    struct {
      int32_t m;
      int32_t w;
    };

  };
};

//
// Table of op type strings; declared here and defined in another
// translation unit.  Presumably indexed by hsg_op_type -- TODO confirm.
//

extern char const * const hsg_op_type_string[];

//
// Per-target handle: a string plus a pointer to target-private state.
// struct hsg_target_state is left incomplete in this header.
//

struct hsg_target
{
  char const              * define;
  struct hsg_target_state * state;
};

//
// All targets share this prototype
//

typedef
void
(*hsg_target_pfn)(struct hsg_target       * const target,
                  struct hsg_config const * const config,
                  struct hsg_merge  const * const merge,
                  struct hsg_op     const * const ops,
                  uint32_t                  const depth);

//
//
//

//
// Target back ends.
//
// Each function below takes the same parameter list as
// hsg_target_pfn: the target handle, the generator configuration,
// the merge description, the op array and a depth value.  The
// definitions live in other translation units.
//

extern
void
hsg_target_debug(struct hsg_target       * const target,
                 struct hsg_config const * const config,
                 struct hsg_merge  const * const merge,
                 struct hsg_op     const * const ops,
                 uint32_t                  const depth);

extern
void
hsg_target_cuda(struct hsg_target       * const target,
                struct hsg_config const * const config,
                struct hsg_merge  const * const merge,
                struct hsg_op     const * const ops,
                uint32_t                  const depth);

extern
void
hsg_target_opencl(struct hsg_target       * const target,
                  struct hsg_config const * const config,
                  struct hsg_merge  const * const merge,
                  struct hsg_op     const * const ops,
                  uint32_t                  const depth);

extern
void
hsg_target_glsl(struct hsg_target       * const target,
                struct hsg_config const * const config,
                struct hsg_merge  const * const merge,
                struct hsg_op     const * const ops,
                uint32_t                  const depth);

//
//
//
