1 /* 2 * Copyright 2017 Google Inc. 3 * 4 * Use of this source code is governed by a BSD-style license that can 5 * be found in the LICENSE file. 6 * 7 */ 8 9 #ifndef SKC_ONCE_DEVICE_CL_12_H 10 #define SKC_ONCE_DEVICE_CL_12_H 11 12 // 13 // FIXME -- THERE ARE SOME DUPLICATED TYPEDEFS IN THIS FILE 14 // 15 // THESE WILL GO AWAY AS THE TYPING GET POLISHED AND SIMPLIFIED 16 // 17 18 #include "block.h" 19 20 // 21 // HOW TO SELECT A SUBBLOCK AND BLOCK SIZES: 22 // 23 // 1) The subblock size should match the natural SIMT/SIMD width of 24 // the target device. 25 // 26 // 2) Either a square or rectangular (1:2) tile size is chosen. The 27 // tile size is usually determined by the amount of SMEM available 28 // to a render kernel subgroup and desired multiprocessor 29 // occupancy. 30 // 31 // 3) If the tile is rectangular then the block size must be at least 32 // twice the size of the subblock size. 33 // 34 // 4) A large block size can decrease allocation overhead but there 35 // will be diminishing returns as the block size increases. 36 // 37 38 #define SKC_DEVICE_BLOCK_WORDS_LOG2 6 // CHANGE "WORDS" TO "SIZE" ? 39 #define SKC_DEVICE_SUBBLOCK_WORDS_LOG2 3 40 41 #define SKC_TILE_WIDTH_LOG2 SKC_DEVICE_SUBBLOCK_WORDS_LOG2 42 #define SKC_TILE_HEIGHT_LOG2 (SKC_DEVICE_SUBBLOCK_WORDS_LOG2 + 1) 43 44 ///////////////////////////////////////////////////////////////// 45 // 46 // BLOCK POOL INIT 47 // 48 49 #define SKC_BP_INIT_IDS_KERNEL_ATTRIBS 50 #define SKC_BP_INIT_ATOMICS_KERNEL_ATTRIBS __attribute__((reqd_work_group_size(2,1,1))) 51 52 ///////////////////////////////////////////////////////////////// 53 // 54 // PATHS ALLOC 55 // 56 57 #define SKC_PATHS_ALLOC_KERNEL_ATTRIBS __attribute__((reqd_work_group_size(1,1,1))) 58 59 ///////////////////////////////////////////////////////////////// 60 // 61 // PATHS COPY 62 // 63 64 #define SKC_PATHS_COPY_SUBGROUP_SIZE_LOG2 SKC_DEVICE_SUBBLOCK_WORDS_LOG2 // FIXME -- SUBGROUP OR THREADS PER BLOCK? 65 #define SKC_PATHS_COPY_ELEM_WORDS 1 66 #define SKC_PATHS_COPY_ELEM_EXPAND() SKC_EXPAND_1() 67 68 #define SKC_PATHS_COPY_KERNEL_ATTRIBS __attribute__((intel_reqd_sub_group_size(SKC_PATHS_COPY_SUBGROUP_SIZE))) 69 70 #define SKC_IS_NOT_PATH_HEAD(sg,I) ((sg) + get_sub_group_local_id() >= SKC_PATH_HEAD_WORDS) 71 72 typedef skc_uint skc_paths_copy_elem; 73 typedef skc_uint skc_pb_idx_v; 74 75 ///////////////////////////////////////////////////////////////// 76 // 77 // FILLS EXPAND 78 // 79 80 #define SKC_FILLS_EXPAND_SUBGROUP_SIZE_LOG2 SKC_DEVICE_SUBBLOCK_WORDS_LOG2 81 #define SKC_FILLS_EXPAND_ELEM_WORDS 1 82 83 #define SKC_FILLS_EXPAND_KERNEL_ATTRIBS __attribute__((intel_reqd_sub_group_size(SKC_FILLS_EXPAND_SUBGROUP_SIZE))) 84 85 ///////////////////////////////////////////////////////////////// 86 // 87 // RASTER ALLOC 88 // 89 // NOTE -- Intel subgroup shuffles aren't supported in SIMD32 which is 90 // why use of the subgroup broadcast produces a compiler error. So a 91 // subgroup of size 16 is this widest we can require. 92 // 93 94 #define SKC_RASTERS_ALLOC_GROUP_SIZE 16 95 96 #if (SKC_RASTERS_ALLOC_GROUP_SIZE <= 16) 97 98 #define SKC_RASTERS_ALLOC_KERNEL_ATTRIBS __attribute__((intel_reqd_sub_group_size(SKC_RASTERS_ALLOC_GROUP_SIZE))) 99 #define SKC_RASTERS_ALLOC_LOCAL_ID() get_sub_group_local_id() 100 #define SKC_RASTERS_ALLOC_INCLUSIVE_ADD(v) sub_group_scan_inclusive_add(v) 101 #define SKC_RASTERS_ALLOC_BROADCAST(v,i) sub_group_broadcast(v,i) 102 103 #else 104 105 #define SKC_RASTERS_ALLOC_KERNEL_ATTRIBS __attribute__((reqd_work_group_size(SKC_RASTERS_ALLOC_GROUP_SIZE,1,1))) 106 #define SKC_RASTERS_ALLOC_LOCAL_ID() get_local_id(0) 107 #define SKC_RASTERS_ALLOC_INCLUSIVE_ADD(v) work_group_scan_inclusive_add(v) 108 #define SKC_RASTERS_ALLOC_BROADCAST(v,i) work_group_broadcast(v,i) 109 110 #endif 111 112 ///////////////////////////////////////////////////////////////// 113 // 114 // RASTERIZE 115 // 116 117 #define SKC_RASTERIZE_SUBGROUP_SIZE SKC_DEVICE_SUBBLOCK_WORDS 118 #define SKC_RASTERIZE_VECTOR_SIZE_LOG2 0 119 #define SKC_RASTERIZE_WORKGROUP_SUBGROUPS 1 120 121 #define SKC_RASTERIZE_KERNEL_ATTRIBS \ 122 __attribute__((intel_reqd_sub_group_size(SKC_RASTERIZE_SUBGROUP_SIZE))) \ 123 __attribute__((reqd_work_group_size(SKC_RASTERIZE_SUBGROUP_SIZE * SKC_RASTERIZE_WORKGROUP_SUBGROUPS, 1, 1))) 124 125 #define SKC_RASTERIZE_FLOAT float 126 #define SKC_RASTERIZE_UINT uint 127 #define SKC_RASTERIZE_INT int 128 #define SKC_RASTERIZE_PREDICATE bool 129 #define SKC_RASTERIZE_POOL uint 130 131 #define SKC_RASTERIZE_TILE_HASH_X_BITS 1 132 #define SKC_RASTERIZE_TILE_HASH_Y_BITS 2 133 134 typedef skc_block_id_t skc_block_id_v_t; 135 typedef skc_uint2 skc_ttsk_v_t; 136 typedef skc_uint2 skc_ttsk_s_t; 137 138 // SKC_STATIC_ASSERT(SKC_RASTERIZE_POOL_SIZE > SKC_RASTERIZE_SUBGROUP_SIZE); 139 140 ///////////////////////////////////////////////////////////////// 141 // 142 // PREFIX 143 // 144 145 #define SKC_PREFIX_SUBGROUP_SIZE 8 // for now this had better be SKC_DEVICE_SUBBLOCK_WORDS 146 #define SKC_PREFIX_WORKGROUP_SUBGROUPS 1 147 148 #define SKC_PREFIX_KERNEL_ATTRIBS \ 149 __attribute__((intel_reqd_sub_group_size(SKC_PREFIX_SUBGROUP_SIZE))) \ 150 __attribute__((reqd_work_group_size(SKC_PREFIX_SUBGROUP_SIZE * SKC_PREFIX_WORKGROUP_SUBGROUPS, 1, 1))) 151 152 #define SKC_PREFIX_TTP_V skc_uint2 153 #define SKC_PREFIX_TTS_V_BITFIELD skc_int 154 155 #define SKC_PREFIX_TTS_VECTOR_INT_EXPAND SKC_EXPAND_1 156 157 #define SKC_PREFIX_SMEM_ZERO ulong 158 #define SKC_PREFIX_SMEM_ZERO_WIDTH (sizeof(SKC_PREFIX_SMEM_ZERO) / sizeof(skc_ttp_t)) 159 #define SKC_PREFIX_SMEM_COUNT_BLOCK_ID 8 160 161 #define SKC_PREFIX_BLOCK_ID_V_SIZE SKC_PREFIX_SUBGROUP_SIZE 162 163 #define SKC_PREFIX_TTXK_V_SIZE SKC_PREFIX_SUBGROUP_SIZE 164 #define SKC_PREFIX_TTXK_V_MASK (SKC_PREFIX_TTXK_V_SIZE - 1) 165 166 typedef skc_uint skc_bp_elem_t; 167 168 typedef skc_uint2 skc_ttrk_e_t; 169 typedef skc_uint2 skc_ttsk_v_t; 170 typedef skc_uint2 skc_ttsk_s_t; 171 typedef skc_uint2 skc_ttpk_s_t; 172 typedef skc_uint2 skc_ttxk_v_t; 173 174 typedef skc_int skc_tts_v_t; 175 176 typedef skc_int skc_ttp_t; 177 178 typedef skc_uint skc_raster_yx_s; 179 180 typedef skc_block_id_t skc_block_id_v_t; 181 typedef skc_block_id_t skc_block_id_s_t; 182 183 ///////////////////////////////////////////////////////////////// 184 // 185 // PLACE 186 // 187 188 #define SKC_PLACE_SUBGROUP_SIZE 16 189 #define SKC_PLACE_WORKGROUP_SUBGROUPS 1 190 191 #define SKC_PLACE_KERNEL_ATTRIBS \ 192 __attribute__((intel_reqd_sub_group_size(SKC_PLACE_SUBGROUP_SIZE))) \ 193 __attribute__((reqd_work_group_size(SKC_PLACE_SUBGROUP_SIZE * SKC_PLACE_WORKGROUP_SUBGROUPS, 1, 1))) 194 195 typedef skc_uint skc_bp_elem_t; 196 197 typedef skc_uint skc_ttsk_lo_t; 198 typedef skc_uint skc_ttsk_hi_t; 199 200 typedef skc_uint skc_ttpk_lo_t; 201 typedef skc_uint skc_ttpk_hi_t; 202 203 typedef skc_uint skc_ttxk_lo_t; 204 typedef skc_uint skc_ttxk_hi_t; 205 206 typedef skc_uint2 skc_ttck_t; 207 208 typedef skc_bool skc_pred_v_t; 209 typedef skc_int skc_int_v_t; 210 211 ///////////////////////////////////////////////////////////////// 212 // 213 // RENDER 214 // 215 216 #define SKC_ARCH_GEN9 217 218 #if defined(__OPENCL_C_VERSION__) 219 #pragma OPENCL EXTENSION cl_khr_fp16 : enable 220 #endif 221 222 #define SKC_RENDER_SUBGROUP_SIZE 8 223 #define SKC_RENDER_WORKGROUP_SUBGROUPS 1 224 225 #define SKC_RENDER_KERNEL_ATTRIBS \ 226 __attribute__((intel_reqd_sub_group_size(SKC_RENDER_SUBGROUP_SIZE))) \ 227 __attribute__((reqd_work_group_size(SKC_RENDER_SUBGROUP_SIZE * SKC_RENDER_WORKGROUP_SUBGROUPS, 1, 1))) 228 229 #define SKC_RENDER_SCANLINE_VECTOR_SIZE 2 230 231 #define SKC_RENDER_REGS_COLOR_R 2 232 #define SKC_RENDER_REGS_COVER_R 3 233 234 #define SKC_RENDER_TTSB_EXPAND() SKC_EXPAND_1() 235 236 #define SKC_RENDER_TTS_V skc_int 237 #define SKC_RENDER_TTS_V_BITFIELD skc_int 238 239 #define SKC_RENDER_TTP_V skc_int2 240 #define SKC_RENDER_AREA_V skc_int2 241 242 #define SKC_RENDER_TILE_COLOR_PAIR half2 243 #define SKC_RENDER_TILE_COLOR_PAIR_LOAD(x,v) vload2(x,v) 244 245 #define SKC_RENDER_SURFACE_COLOR half4 246 #define SKC_RENDER_SURFACE_WRITE write_imageh 247 248 // #define SKC_RENDER_TTXB_VECTOR_INT int2 249 // #define SKC_RENDER_TTXB_VECTOR_UINT uint2 250 251 #define SKC_RENDER_WIDE_AA ulong // SLM = 64 bytes/clock 252 253 #define SKC_RENDER_TILE_COLOR half2 254 #define SKC_RENDER_TILE_COVER half2 255 256 #define SKC_RENDER_ACC_COVER_INT int2 257 #define SKC_RENDER_ACC_COVER_UINT uint2 258 259 #define SKC_RENDER_GRADIENT_FLOAT float2 260 #define SKC_RENDER_GRADIENT_INT int2 261 #define SKC_RENDER_GRADIENT_STOP int2 262 #define SKC_RENDER_GRADIENT_FRAC half2 263 #define SKC_RENDER_GRADIENT_COLOR_STOP half 264 265 #define SKC_RENDER_SURFACE_U8_RGBA uint2 266 267 #define SKC_RENDER_TILE_COLOR_VECTOR uint16 268 #define SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT uint 269 #define SKC_RENDER_TILE_COLOR_VECTOR_COUNT ((sizeof(SKC_RENDER_TILE_COLOR) * 4 * SKC_TILE_WIDTH) / sizeof(SKC_RENDER_TILE_COLOR_VECTOR)) 270 271 ///////////////////////////////////////////////////////////////// 272 // 273 // PATHS & RASTERS RECLAIM 274 // 275 // FIXME -- investigate enabling the stride option for a smaller grid 276 // that iterates over a fixed number of threads. Since reclamation is 277 // a low-priority task, it's probably reasonable to trade longer 278 // reclamation times for lower occupancy of the device because it 279 // might delay the fastpath of the pipeline. 280 // 281 282 #define SKC_RECLAIM_ARRAY_SIZE (7 * 8 / 2) // 8 EUs with 7 hardware threads divided by 2 is half a sub-slice 283 284 ///////////////////////////////////////////////////////////////// 285 // 286 // PATHS RECLAIM 287 // 288 289 #define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2 SKC_DEVICE_SUBBLOCK_WORDS_LOG2 // FIXME -- SUBGROUP OR THREADS PER BLOCK? 290 #define SKC_PATHS_RECLAIM_LOCAL_ELEMS 1 291 #define SKC_PATHS_RECLAIM_KERNEL_ATTRIBS __attribute__((intel_reqd_sub_group_size(SKC_PATHS_RECLAIM_SUBGROUP_SIZE))) 292 293 ///////////////////////////////////////////////////////////////// 294 // 295 // RASTERS RECLAIM 296 // 297 298 #define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2 SKC_DEVICE_SUBBLOCK_WORDS_LOG2 // FIXME -- SUBGROUP OR THREADS PER BLOCK? 299 #define SKC_RASTERS_RECLAIM_LOCAL_ELEMS 1 300 #define SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS __attribute__((intel_reqd_sub_group_size(SKC_RASTERS_RECLAIM_SUBGROUP_SIZE))) 301 302 // 303 // COMMON -- FIXME -- HOIST THESE ELSEWHERE 304 // 305 306 #define SKC_DEVICE_BLOCK_WORDS (1u << SKC_DEVICE_BLOCK_WORDS_LOG2) 307 #define SKC_DEVICE_SUBBLOCK_WORDS (1u << SKC_DEVICE_SUBBLOCK_WORDS_LOG2) 308 309 #define SKC_DEVICE_BLOCK_DWORDS (SKC_DEVICE_BLOCK_WORDS / 2) 310 311 #define SKC_DEVICE_BLOCK_WORDS_MASK SKC_BITS_TO_MASK(SKC_DEVICE_BLOCK_WORDS_LOG2) 312 #define SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK SKC_BITS_TO_MASK(SKC_DEVICE_BLOCK_WORDS_LOG2 - SKC_DEVICE_SUBBLOCK_WORDS_LOG2) 313 314 #define SKC_DEVICE_SUBBLOCKS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_DEVICE_SUBBLOCK_WORDS) 315 316 #define SKC_TILE_RATIO (SKC_TILE_HEIGHT / SKC_TILE_WIDTH) 317 318 // 319 // 320 // 321 322 #define SKC_PATHS_COPY_SUBGROUP_SIZE (1 << SKC_PATHS_COPY_SUBGROUP_SIZE_LOG2) 323 #define SKC_PATHS_RECLAIM_SUBGROUP_SIZE (1 << SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2) 324 #define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE (1 << SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2) 325 #define SKC_FILLS_EXPAND_SUBGROUP_SIZE (1 << SKC_FILLS_EXPAND_SUBGROUP_SIZE_LOG2) 326 327 // 328 // 329 // 330 331 #endif 332 333 // 334 // 335 // 336