/*
 * Copyright © 2013-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "isl/isl.h"
#include "brw_fs_surface_builder.h"
#include "brw_fs.h"

using namespace brw;

namespace brw {
   namespace surface_access {
      namespace {
         /**
          * Generate a logical send opcode for a surface message and return
          * the result.
          */
         fs_reg
         emit_send(const fs_builder &bld, enum opcode opcode,
                   const fs_reg &addr, const fs_reg &src, const fs_reg &surface,
                   unsigned dims, unsigned arg, unsigned rsize,
                   brw_predicate pred = BRW_PREDICATE_NONE)
         {
            /* Reduce the dynamically uniform surface index to a single
             * scalar.
             */
            const fs_reg usurface = bld.emit_uniformize(surface);
            const fs_reg srcs[] = {
               addr, src, usurface, brw_imm_ud(dims), brw_imm_ud(arg)
            };
            const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, rsize);
            fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));

            inst->size_written = rsize * dst.component_size(inst->exec_size);
            inst->predicate = pred;
            return dst;
         }
      }

      /**
       * Emit an untyped surface read opcode.  \p dims determines the number
       * of components of the address and \p size the number of components of
       * the returned value.
       */
      fs_reg
      emit_untyped_read(const fs_builder &bld,
                        const fs_reg &surface, const fs_reg &addr,
                        unsigned dims, unsigned size,
                        brw_predicate pred)
      {
         return emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
                          addr, fs_reg(), surface, dims, size, size, pred);
      }

      /**
       * Emit an untyped surface write opcode.  \p dims determines the number
       * of components of the address and \p size the number of components of
       * the argument.
       */
      void
      emit_untyped_write(const fs_builder &bld, const fs_reg &surface,
                         const fs_reg &addr, const fs_reg &src,
                         unsigned dims, unsigned size,
                         brw_predicate pred)
      {
         emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
                   addr, src, surface, dims, size, 0, pred);
      }

      /**
       * Emit an untyped surface atomic opcode.  \p dims determines the number
       * of components of the address and \p rsize the number of components of
       * the returned value (either zero or one).
       */
      fs_reg
      emit_untyped_atomic(const fs_builder &bld,
                          const fs_reg &surface, const fs_reg &addr,
                          const fs_reg &src0, const fs_reg &src1,
                          unsigned dims, unsigned rsize, unsigned op,
                          brw_predicate pred)
      {
         /* FINISHME: Factor out this frequently recurring pattern into a
          * helper function.
          */
         const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
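         /* E.g. an atomic increment takes no data arguments (n == 0), an
          * atomic add takes one and a compare-and-swap takes two, so only
          * the sources actually present get marshalled into the payload
          * below.
          */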
         const fs_reg srcs[] = { src0, src1 };
         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
         bld.LOAD_PAYLOAD(tmp, srcs, n, 0);

         return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
                          addr, tmp, surface, dims, op, rsize, pred);
      }

      /**
       * Emit a typed surface read opcode.  \p dims determines the number of
       * components of the address and \p size the number of components of the
       * returned value.
       */
      fs_reg
      emit_typed_read(const fs_builder &bld, const fs_reg &surface,
                      const fs_reg &addr, unsigned dims, unsigned size)
      {
         return emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
                          addr, fs_reg(), surface, dims, size, size);
      }

      /**
       * Emit a typed surface write opcode.  \p dims determines the number of
       * components of the address and \p size the number of components of the
       * argument.
       */
      void
      emit_typed_write(const fs_builder &bld, const fs_reg &surface,
                       const fs_reg &addr, const fs_reg &src,
                       unsigned dims, unsigned size)
      {
         emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
                   addr, src, surface, dims, size, 0);
      }

      /**
       * Emit a typed surface atomic opcode.  \p dims determines the number of
       * components of the address and \p rsize the number of components of
       * the returned value (either zero or one).
       */
      fs_reg
      emit_typed_atomic(const fs_builder &bld, const fs_reg &surface,
                        const fs_reg &addr,
                        const fs_reg &src0, const fs_reg &src1,
                        unsigned dims, unsigned rsize, unsigned op,
                        brw_predicate pred)
      {
         /* FINISHME: Factor out this frequently recurring pattern into a
          * helper function.
          */
         const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
         const fs_reg srcs[] = { src0, src1 };
         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
         bld.LOAD_PAYLOAD(tmp, srcs, n, 0);

         /* Forward \p pred so the atomic is predicated like the untyped
          * variant above; otherwise the unbound-surface check performed by
          * the callers would have no effect on the message itself.
          */
         return emit_send(bld, SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
                          addr, tmp, surface, dims, op, rsize, pred);
      }
   }
}
namespace {
   namespace image_format_info {
      /* The higher compiler layers use the GL enums for image formats even if
       * they come in from SPIR-V or Vulkan.  We need to turn them into an ISL
       * enum before we can use them.
       */
      enum isl_format
      isl_format_for_gl_format(uint32_t gl_format)
      {
         switch (gl_format) {
         case GL_R8:             return ISL_FORMAT_R8_UNORM;
         case GL_R8_SNORM:       return ISL_FORMAT_R8_SNORM;
         case GL_R8UI:           return ISL_FORMAT_R8_UINT;
         case GL_R8I:            return ISL_FORMAT_R8_SINT;
         case GL_RG8:            return ISL_FORMAT_R8G8_UNORM;
         case GL_RG8_SNORM:      return ISL_FORMAT_R8G8_SNORM;
         case GL_RG8UI:          return ISL_FORMAT_R8G8_UINT;
         case GL_RG8I:           return ISL_FORMAT_R8G8_SINT;
         case GL_RGBA8:          return ISL_FORMAT_R8G8B8A8_UNORM;
         case GL_RGBA8_SNORM:    return ISL_FORMAT_R8G8B8A8_SNORM;
         case GL_RGBA8UI:        return ISL_FORMAT_R8G8B8A8_UINT;
         case GL_RGBA8I:         return ISL_FORMAT_R8G8B8A8_SINT;
         case GL_R11F_G11F_B10F: return ISL_FORMAT_R11G11B10_FLOAT;
         case GL_RGB10_A2:       return ISL_FORMAT_R10G10B10A2_UNORM;
         case GL_RGB10_A2UI:     return ISL_FORMAT_R10G10B10A2_UINT;
         case GL_R16:            return ISL_FORMAT_R16_UNORM;
         case GL_R16_SNORM:      return ISL_FORMAT_R16_SNORM;
         case GL_R16F:           return ISL_FORMAT_R16_FLOAT;
         case GL_R16UI:          return ISL_FORMAT_R16_UINT;
         case GL_R16I:           return ISL_FORMAT_R16_SINT;
         case GL_RG16:           return ISL_FORMAT_R16G16_UNORM;
         case GL_RG16_SNORM:     return ISL_FORMAT_R16G16_SNORM;
         case GL_RG16F:          return ISL_FORMAT_R16G16_FLOAT;
         case GL_RG16UI:         return ISL_FORMAT_R16G16_UINT;
         case GL_RG16I:          return ISL_FORMAT_R16G16_SINT;
         case GL_RGBA16:         return ISL_FORMAT_R16G16B16A16_UNORM;
         case GL_RGBA16_SNORM:   return ISL_FORMAT_R16G16B16A16_SNORM;
         case GL_RGBA16F:        return ISL_FORMAT_R16G16B16A16_FLOAT;
         case GL_RGBA16UI:       return ISL_FORMAT_R16G16B16A16_UINT;
         case GL_RGBA16I:        return ISL_FORMAT_R16G16B16A16_SINT;
         case GL_R32F:           return ISL_FORMAT_R32_FLOAT;
         case GL_R32UI:          return ISL_FORMAT_R32_UINT;
         case GL_R32I:           return ISL_FORMAT_R32_SINT;
         case GL_RG32F:          return ISL_FORMAT_R32G32_FLOAT;
         case GL_RG32UI:         return ISL_FORMAT_R32G32_UINT;
         case GL_RG32I:          return ISL_FORMAT_R32G32_SINT;
         case GL_RGBA32F:        return ISL_FORMAT_R32G32B32A32_FLOAT;
         case GL_RGBA32UI:       return ISL_FORMAT_R32G32B32A32_UINT;
         case GL_RGBA32I:        return ISL_FORMAT_R32G32B32A32_SINT;
         case GL_NONE:           return ISL_FORMAT_UNSUPPORTED;
         default:
            assert(!"Invalid image format");
            return ISL_FORMAT_UNSUPPORTED;
         }
      }

      /**
       * Simple 4-tuple of scalars used to pass around per-color component
       * values.
       */
      struct color_u {
         color_u(unsigned x = 0) : r(x), g(x), b(x), a(x)
         {
         }

         color_u(unsigned r, unsigned g, unsigned b, unsigned a) :
            r(r), g(g), b(b), a(a)
         {
         }

         unsigned
         operator[](unsigned i) const
         {
            const unsigned xs[] = { r, g, b, a };
            return xs[i];
         }

         unsigned r, g, b, a;
      };

      /**
       * Return the per-channel bitfield widths for a given image format.
       */
      inline color_u
      get_bit_widths(isl_format format)
      {
         const isl_format_layout *fmtl = isl_format_get_layout(format);

         return color_u(fmtl->channels.r.bits,
                        fmtl->channels.g.bits,
                        fmtl->channels.b.bits,
                        fmtl->channels.a.bits);
      }

      /**
       * Return the per-channel bitfield shifts for a given image format.
       */
      inline color_u
      get_bit_shifts(isl_format format)
      {
         const color_u widths = get_bit_widths(format);
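         /* E.g. GL_RGB10_A2 has widths (10, 10, 10, 2) and hence shifts
          * (0, 10, 20, 30), while GL_RGBA8 gets shifts (0, 8, 16, 24).
          */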
         return color_u(0, widths.r, widths.r + widths.g,
                        widths.r + widths.g + widths.b);
      }

      /**
       * Return true if all present components have the same bit width.
       */
      inline bool
      is_homogeneous(isl_format format)
      {
         const color_u widths = get_bit_widths(format);
         return ((widths.g == 0 || widths.g == widths.r) &&
                 (widths.b == 0 || widths.b == widths.r) &&
                 (widths.a == 0 || widths.a == widths.r));
      }

      /**
       * Return true if the format conversion boils down to a trivial copy.
       */
      inline bool
      is_conversion_trivial(const gen_device_info *devinfo, isl_format format)
      {
         return (get_bit_widths(format).r == 32 && is_homogeneous(format)) ||
                 format == isl_lower_storage_image_format(devinfo, format);
      }

      /**
       * Return true if the hardware natively supports some format with
       * compatible bitfield layout, but possibly different data types.
       */
      inline bool
      has_supported_bit_layout(const gen_device_info *devinfo,
                               isl_format format)
      {
         const color_u widths = get_bit_widths(format);
         const color_u lower_widths = get_bit_widths(
            isl_lower_storage_image_format(devinfo, format));

         return (widths.r == lower_widths.r &&
                 widths.g == lower_widths.g &&
                 widths.b == lower_widths.b &&
                 widths.a == lower_widths.a);
      }

      /**
       * Return true if we are required to spread individual components over
       * several components of the format used by the hardware (RG32 and
       * friends implemented as RGBA16UI).
       */
      inline bool
      has_split_bit_layout(const gen_device_info *devinfo, isl_format format)
      {
         const isl_format lower_format =
            isl_lower_storage_image_format(devinfo, format);
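         /* E.g. where the RG32 formats are lowered to RGBA16UI as described
          * above, each logical 32-bit component covers two 16-bit hardware
          * components, so the lowered format ends up with more channels than
          * the original one.
          */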
         return (isl_format_get_num_channels(format) <
                 isl_format_get_num_channels(lower_format));
      }

      /**
       * Return true if the hardware returns garbage in the unused high bits
       * of each component.  This may happen on IVB because we rely on the
       * undocumented behavior that typed reads from surfaces of the
       * unsupported R8 and R16 formats return useful data in their least
       * significant bits.
       */
      inline bool
      has_undefined_high_bits(const gen_device_info *devinfo,
                              isl_format format)
      {
         const isl_format lower_format =
            isl_lower_storage_image_format(devinfo, format);

         return (devinfo->gen == 7 && !devinfo->is_haswell &&
                 (lower_format == ISL_FORMAT_R16_UINT ||
                  lower_format == ISL_FORMAT_R8_UINT));
      }

      /**
       * Return true if the format represents values as signed integers
       * requiring sign extension when unpacking.
       */
      inline bool
      needs_sign_extension(isl_format format)
      {
         return isl_format_has_snorm_channel(format) ||
                isl_format_has_sint_channel(format);
      }
   }

   namespace image_validity {
      /**
       * Check whether the bound image is suitable for untyped access.
       */
      brw_predicate
      emit_untyped_image_check(const fs_builder &bld, const fs_reg &image,
                               brw_predicate pred)
      {
         const gen_device_info *devinfo = bld.shader->devinfo;
         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);

         if (devinfo->gen == 7 && !devinfo->is_haswell) {
            /* Check whether the first stride component (i.e. the Bpp value)
             * is greater than four, which on Gen7 indicates that a surface of
             * type RAW has been bound for untyped access.  Reading or writing
             * to a surface of type other than RAW using untyped surface
             * messages causes a hang on IVB and VLV.
             */
            set_predicate(pred,
                          bld.CMP(bld.null_reg_ud(), stride, brw_imm_d(4),
                                  BRW_CONDITIONAL_G));

            return BRW_PREDICATE_NORMAL;
         } else {
            /* More recent generations handle the format mismatch
             * gracefully.
             */
            return pred;
         }
      }

      /**
       * Check whether there is an image bound at the given index and write
       * the comparison result to f0.0.  Returns an appropriate predication
       * mode to use on subsequent image operations.
       */
      brw_predicate
      emit_typed_atomic_check(const fs_builder &bld, const fs_reg &image)
      {
         const gen_device_info *devinfo = bld.shader->devinfo;
         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);

         if (devinfo->gen == 7 && !devinfo->is_haswell) {
            /* Check the first component of the size field to find out if the
             * image is bound.  Necessary on IVB for typed atomics because
             * they don't seem to respect null surfaces and will happily
             * corrupt or read random memory when no image is bound.
             */
            bld.CMP(bld.null_reg_ud(),
                    retype(size, BRW_REGISTER_TYPE_UD),
                    brw_imm_d(0), BRW_CONDITIONAL_NZ);

            return BRW_PREDICATE_NORMAL;
         } else {
            /* More recent platforms implement compliant behavior when a null
             * surface is bound.
             */
            return BRW_PREDICATE_NONE;
         }
      }
      /**
       * Check whether the provided coordinates are within the image bounds
       * and write the comparison result to f0.0.  Returns an appropriate
       * predication mode to use on subsequent image operations.
       */
      brw_predicate
      emit_bounds_check(const fs_builder &bld, const fs_reg &image,
                        const fs_reg &addr, unsigned dims)
      {
         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
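         /* The first CMP below is emitted unpredicated and initializes
          * f0.0, while each subsequent CMP is predicated on the previous
          * result, so the final flag value is the AND of the per-dimension
          * "coordinate < size" comparisons.
          */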
         for (unsigned c = 0; c < dims; ++c)
            set_predicate(c == 0 ? BRW_PREDICATE_NONE : BRW_PREDICATE_NORMAL,
                          bld.CMP(bld.null_reg_ud(),
                                  offset(retype(addr, BRW_REGISTER_TYPE_UD), bld, c),
                                  offset(size, bld, c),
                                  BRW_CONDITIONAL_L));

         return BRW_PREDICATE_NORMAL;
      }
   }

   namespace image_coordinates {
      /**
       * Return the total number of coordinates needed to address a texel of
       * the surface, which may be more than the sum of \p surf_dims and \p
       * arr_dims if padding is required.
       */
      unsigned
      num_image_coordinates(const fs_builder &bld,
                            unsigned surf_dims, unsigned arr_dims,
                            isl_format format)
      {
         /* HSW in vec4 mode and our software coordinate handling for untyped
          * reads want the array index to be at the Z component.
          */
         const bool array_index_at_z =
            format != ISL_FORMAT_UNSUPPORTED &&
            !isl_has_matching_typed_storage_image_format(
               bld.shader->devinfo, format);
         const unsigned zero_dims =
            ((surf_dims == 1 && arr_dims == 1 && array_index_at_z) ? 1 : 0);
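         /* E.g. a 1-D array image without a matching typed format
          * (surf_dims == 1, arr_dims == 1) ends up with three coordinates,
          * (x, 0, array_index), with the zero inserted at the Y component
          * by emit_image_coordinates() below.
          */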
         return surf_dims + zero_dims + arr_dims;
      }

      /**
       * Transform image coordinates into the form expected by the
       * implementation.
       */
      fs_reg
      emit_image_coordinates(const fs_builder &bld, const fs_reg &addr,
                             unsigned surf_dims, unsigned arr_dims,
                             isl_format format)
      {
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (dims > surf_dims + arr_dims) {
            assert(surf_dims == 1 && arr_dims == 1 && dims == 3);
            /* The array index is required to be passed in as the Z component,
             * insert a zero at the Y component to shift it to the right
             * position.
             *
             * FINISHME: Factor out this frequently recurring pattern into a
             * helper function.
             */
            const fs_reg srcs[] = { addr, brw_imm_d(0), offset(addr, bld, 1) };
            const fs_reg dst = bld.vgrf(addr.type, dims);
            bld.LOAD_PAYLOAD(dst, srcs, dims, 0);
            return dst;
         } else {
            return addr;
         }
      }

      /**
       * Calculate the offset in memory of the texel given by \p coord.
       *
       * This is meant to be used with untyped surface messages to access a
       * tiled surface, which involves taking into account the tiling and
       * swizzling modes of the surface manually, so it will hopefully not
       * happen very often.
       *
       * The tiling algorithm implemented here matches either the X or Y
       * tiling layouts supported by the hardware depending on the tiling
       * coefficients passed to the program as uniforms.  See Volume 1 Part 2
       * Section 4.5 "Address Tiling Function" of the IVB PRM for an in-depth
       * explanation of the hardware tiling format.
       */
      fs_reg
      emit_address_calculation(const fs_builder &bld, const fs_reg &image,
                               const fs_reg &coord, unsigned dims)
      {
         const gen_device_info *devinfo = bld.shader->devinfo;
         const fs_reg off = offset(image, bld, BRW_IMAGE_PARAM_OFFSET_OFFSET);
         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
         const fs_reg tile = offset(image, bld, BRW_IMAGE_PARAM_TILING_OFFSET);
         const fs_reg swz = offset(image, bld, BRW_IMAGE_PARAM_SWIZZLING_OFFSET);
         const fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg minor = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg major = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);

         /* Shift the coordinates by the fixed surface offset.  It may be
          * non-zero if the image is a single slice of a higher-dimensional
          * surface, or if a non-zero mipmap level of the surface is bound to
          * the pipeline.  The offset needs to be applied here rather than at
          * surface state set-up time because the desired slice-level may
          * start mid-tile, so simply shifting the surface base address
          * wouldn't give a well-formed tiled surface in the general case.
          */
         for (unsigned c = 0; c < 2; ++c)
            bld.ADD(offset(addr, bld, c), offset(off, bld, c),
                    (c < dims ?
                     offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, c) :
                     fs_reg(brw_imm_d(0))));

         /* The layout of 3-D textures in memory is sort-of like a tiling
          * format.  At each miplevel, the slices are arranged in rows of
          * 2^level slices per row.  The slice row is stored in tmp.y and
          * the slice within the row is stored in tmp.x.
          *
          * The layout of 2-D array textures and cubemaps is much simpler:
          * Depending on whether the ARYSPC_LOD0 layout is in use it will be
          * stored in memory as an array of slices, each one being a 2-D
          * arrangement of miplevels, or as a 2-D arrangement of miplevels,
          * each one being an array of slices.  In either case the separation
          * between slices of the same LOD is equal to the qpitch value
          * provided as stride.w.
          *
          * This code can be made to handle either 2-D arrays or 3-D textures
          * by passing in the miplevel as tile.z for 3-D textures and 0 in
          * tile.z for 2-D array textures.
          *
          * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface
          * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
          * of the hardware 3D texture and 2D array layouts.
          */
         if (dims > 2) {
            /* Decompose z into a major (tmp.y) and a minor (tmp.x)
             * index.
             */
            bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), brw_imm_d(0),
                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2));
            bld.SHR(offset(tmp, bld, 1),
                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2),
                    offset(tile, bld, 2));

            /* Take into account the horizontal (tmp.x) and vertical (tmp.y)
             * slice offset.
             */
            for (unsigned c = 0; c < 2; ++c) {
               bld.MUL(offset(tmp, bld, c),
                       offset(stride, bld, 2 + c), offset(tmp, bld, c));
               bld.ADD(offset(addr, bld, c),
                       offset(addr, bld, c), offset(tmp, bld, c));
            }
         }

         if (dims > 1) {
            /* Calculate the major/minor x and y indices.  In order to
             * accommodate both X and Y tiling, the Y-major tiling format is
             * treated as being a bunch of narrow X-tiles placed next to each
             * other.  This means that the tile width for Y-tiling is actually
             * the width of one sub-column of the Y-major tile where each 4K
             * tile has 8 512B sub-columns.
             *
             * The major Y value is the row of tiles in which the pixel lives.
             * The major X value is the tile sub-column in which the pixel
             * lives; for X tiling, this is the same as the tile column, for Y
             * tiling, each tile has 8 sub-columns.  The minor X and Y indices
             * are the position within the sub-column.
             */
            for (unsigned c = 0; c < 2; ++c) {
               /* Calculate the minor x and y indices. */
               bld.BFE(offset(minor, bld, c), offset(tile, bld, c),
                       brw_imm_d(0), offset(addr, bld, c));

               /* Calculate the major x and y indices. */
               bld.SHR(offset(major, bld, c),
                       offset(addr, bld, c), offset(tile, bld, c));
            }

            /* Calculate the texel index from the start of the tile row and
             * the vertical coordinate of the row.
             * Equivalent to:
             *   tmp.x = (major.x << tile.y << tile.x) +
             *           (minor.y << tile.x) + minor.x
             *   tmp.y = major.y << tile.y
             */
            bld.SHL(tmp, major, offset(tile, bld, 1));
            bld.ADD(tmp, tmp, offset(minor, bld, 1));
            bld.SHL(tmp, tmp, offset(tile, bld, 0));
            bld.ADD(tmp, tmp, minor);
            bld.SHL(offset(tmp, bld, 1),
                    offset(major, bld, 1), offset(tile, bld, 1));

            /* Add it to the start of the tile row. */
            bld.MUL(offset(tmp, bld, 1),
                    offset(tmp, bld, 1), offset(stride, bld, 1));
            bld.ADD(tmp, tmp, offset(tmp, bld, 1));

            /* Multiply by the Bpp value. */
            bld.MUL(dst, tmp, stride);

            if (devinfo->gen < 8 && !devinfo->is_baytrail) {
               /* Take into account the two dynamically specified shifts.
                * Both are needed to implement swizzling of X-tiled
                * surfaces.  For Y-tiled surfaces only one bit needs to be
                * XOR-ed with bit 6 of the memory address, so a swz value of
                * 0xff (actually interpreted as 31 by the hardware) will be
                * provided to cause the relevant bit of tmp.y to be zero and
                * turn the first XOR into the identity.  For linear surfaces
                * or platforms lacking address swizzling both shifts will be
                * 0xff causing the relevant bits of both tmp.x and tmp.y to
                * be zero, which effectively disables swizzling.
                */
               for (unsigned c = 0; c < 2; ++c)
                  bld.SHR(offset(tmp, bld, c), dst, offset(swz, bld, c));

               /* XOR tmp.x and tmp.y with bit 6 of the memory address. */
               bld.XOR(tmp, tmp, offset(tmp, bld, 1));
               bld.AND(tmp, tmp, brw_imm_d(1 << 6));
               bld.XOR(dst, dst, tmp);
            }

         } else {
            /* Multiply by the Bpp/stride value.  Note that the addr.y may be
             * non-zero even if the image is one-dimensional because a
             * vertical offset may have been applied above to select a
             * non-zero slice or level of a higher-dimensional texture.
             */
            bld.MUL(offset(addr, bld, 1),
                    offset(addr, bld, 1), offset(stride, bld, 1));
            bld.ADD(addr, addr, offset(addr, bld, 1));
            bld.MUL(dst, addr, stride);
         }

         return dst;
      }
   }

   namespace image_format_conversion {
      using image_format_info::color_u;

      namespace {
         /**
          * Maximum representable value in an unsigned integer with the given
          * number of bits.
          */
         inline unsigned
         scale(unsigned n)
         {
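            /* E.g. scale(8) == 0xff.  The same value doubles as the
             * normalization constant (1.0f / scale(widths[c] - s)) in the
             * scaled conversions below.
             */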
            return (1 << n) - 1;
         }
      }

      /**
       * Pack the vector \p src in a bitfield given the per-component bit
       * shifts and widths.  Note that bitfield components are not allowed to
       * cross 32-bit boundaries.
       */
      fs_reg
      emit_pack(const fs_builder &bld, const fs_reg &src,
                const color_u &shifts, const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         bool seen[4] = {};

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);

               /* Shift each component left to the correct bitfield position. */
               bld.SHL(tmp, offset(src, bld, c), brw_imm_ud(shifts[c] % 32));

               /* Add everything up. */
               if (seen[shifts[c] / 32]) {
                  bld.OR(offset(dst, bld, shifts[c] / 32),
                         offset(dst, bld, shifts[c] / 32), tmp);
               } else {
                  bld.MOV(offset(dst, bld, shifts[c] / 32), tmp);
                  seen[shifts[c] / 32] = true;
               }
            }
         }

         return dst;
      }

      /**
       * Unpack a vector from the bitfield \p src given the per-component bit
       * shifts and widths.  Note that bitfield components are not allowed to
       * cross 32-bit boundaries.
       */
      fs_reg
      emit_unpack(const fs_builder &bld, const fs_reg &src,
                  const color_u &shifts, const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(src.type, 4);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
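               /* E.g. for the blue channel of RGB10_A2 (shift 20, width 10)
                * this shifts left by 2 and then right by 22, moving bits
                * 29:20 down to 9:0; ASR shifts in copies of the sign bit
                * for signed register types and zeros otherwise.
                */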
               /* Shift left to discard the most significant bits. */
               bld.SHL(offset(dst, bld, c),
                       offset(src, bld, shifts[c] / 32),
                       brw_imm_ud(32 - shifts[c] % 32 - widths[c]));

               /* Shift back to the least significant bits using an arithmetic
                * shift to get sign extension on signed types.
                */
               bld.ASR(offset(dst, bld, c),
                       offset(dst, bld, c), brw_imm_ud(32 - widths[c]));
            }
         }

         return dst;
      }

      /**
       * Convert an integer vector into another integer vector of the
       * specified bit widths, properly handling overflow.
       */
      fs_reg
      emit_convert_to_integer(const fs_builder &bld, const fs_reg &src,
                              const color_u &widths, bool is_signed)
      {
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(
            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
         assert(src.type == dst.type);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Clamp to the maximum value. */
               bld.emit_minmax(offset(dst, bld, c), offset(src, bld, c),
                               brw_imm_d((int)scale(widths[c] - s)),
                               BRW_CONDITIONAL_L);

               /* Clamp to the minimum value. */
               if (is_signed)
                  bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c),
                                  brw_imm_d(-(int)scale(widths[c] - s) - 1),
                                  BRW_CONDITIONAL_GE);

               /* Mask off all but the bits we actually want.  Otherwise, if
                * we pass a negative number into the hardware when it's
                * expecting something like UINT8, it will happily clamp it to
                * +255 for us.
                */
               if (is_signed && widths[c] < 32)
                  bld.AND(offset(dst, bld, c), offset(dst, bld, c),
                          brw_imm_d(scale(widths[c])));
            }
         }

         return dst;
      }

      /**
       * Convert a normalized fixed-point vector of the specified signedness
       * and bit widths into a floating point vector.
       */
      fs_reg
      emit_convert_from_scaled(const fs_builder &bld, const fs_reg &src,
                               const color_u &widths, bool is_signed)
      {
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_F, 4);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Convert to float. */
               bld.MOV(offset(dst, bld, c), offset(src, bld, c));

               /* Divide by the normalization constants. */
               bld.MUL(offset(dst, bld, c), offset(dst, bld, c),
                       brw_imm_f(1.0f / scale(widths[c] - s)));
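               /* E.g. an 8-bit SNORM component is divided by scale(7) ==
                * 127, so the single out-of-range code point -128 maps to
                * roughly -1.008 and gets clamped to -1.0 below, matching
                * the usual SNORM conversion rules.
                */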
               /* Clamp to the minimum value. */
               if (is_signed)
                  bld.emit_minmax(offset(dst, bld, c),
                                  offset(dst, bld, c), brw_imm_f(-1.0f),
                                  BRW_CONDITIONAL_GE);
            }
         }
         return dst;
      }

      /**
       * Convert a floating-point vector into a normalized fixed-point vector
       * of the specified signedness and bit widths.
       */
      fs_reg
      emit_convert_to_scaled(const fs_builder &bld, const fs_reg &src,
                             const color_u &widths, bool is_signed)
      {
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(
            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Clamp the normalized floating-point argument. */
               if (is_signed) {
                  bld.emit_minmax(offset(fdst, bld, c), offset(src, bld, c),
                                  brw_imm_f(-1.0f), BRW_CONDITIONAL_GE);

                  bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
                                  brw_imm_f(1.0f), BRW_CONDITIONAL_L);
               } else {
                  set_saturate(true, bld.MOV(offset(fdst, bld, c),
                                             offset(src, bld, c)));
               }

               /* Multiply by the normalization constants. */
               bld.MUL(offset(fdst, bld, c), offset(fdst, bld, c),
                       brw_imm_f((float)scale(widths[c] - s)));

               /* Convert to integer. */
               bld.RNDE(offset(fdst, bld, c), offset(fdst, bld, c));
               bld.MOV(offset(dst, bld, c), offset(fdst, bld, c));

               /* Mask off all but the bits we actually want.  Otherwise, if
                * we pass a negative number into the hardware when it's
                * expecting something like UINT8, it will happily clamp it to
                * +255 for us.
                */
               if (is_signed && widths[c] < 32)
                  bld.AND(offset(dst, bld, c), offset(dst, bld, c),
                          brw_imm_d(scale(widths[c])));
            }
         }

         return dst;
      }

      /**
       * Convert a floating point vector of the specified bit widths into a
       * 32-bit floating point vector.
       */
      fs_reg
      emit_convert_from_float(const fs_builder &bld, const fs_reg &src,
                              const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               bld.MOV(offset(dst, bld, c), offset(src, bld, c));

               /* Extend 10-bit and 11-bit floating point numbers to 15 bits.
                * This works because they have a 5-bit exponent just like the
                * 16-bit floating point format, and they have no sign bit.
                */
               if (widths[c] < 16)
                  bld.SHL(offset(dst, bld, c),
                          offset(dst, bld, c), brw_imm_ud(15 - widths[c]));
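               /* E.g. an 11-bit component (5-bit exponent, 6-bit mantissa)
                * is shifted left by 4, lining its exponent up with bits
                * 14:10 of a half float and leaving the sign bit zero, so
                * the F16TO32 below produces the equivalent 32-bit value.
                */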
               /* Convert to 32-bit floating point. */
               bld.F16TO32(offset(fdst, bld, c), offset(dst, bld, c));
            }
         }

         return fdst;
      }

      /**
       * Convert a vector into a floating point vector of the specified bit
       * widths.
       */
      fs_reg
      emit_convert_to_float(const fs_builder &bld, const fs_reg &src,
                            const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               bld.MOV(offset(fdst, bld, c), offset(src, bld, c));

               /* Clamp to the minimum value. */
               if (widths[c] < 16)
                  bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
                                  brw_imm_f(0.0f), BRW_CONDITIONAL_GE);

               /* Convert to 16-bit floating-point. */
               bld.F32TO16(offset(dst, bld, c), offset(fdst, bld, c));

               /* Discard the least significant bits to get floating point
                * numbers of the requested width.  This works because the
                * 10-bit and 11-bit floating point formats have a 5-bit
                * exponent just like the 16-bit format, and they have no sign
                * bit.
                */
               if (widths[c] < 16)
                  bld.SHR(offset(dst, bld, c), offset(dst, bld, c),
                          brw_imm_ud(15 - widths[c]));
            }
         }

         return dst;
      }

      /**
       * Fill missing components of a vector with 0, 0, 0, 1.
       */
      fs_reg
      emit_pad(const fs_builder &bld, const fs_reg &src,
               const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(src.type, 4);
         const unsigned pad[] = { 0, 0, 0, 1 };

         for (unsigned c = 0; c < 4; ++c)
            bld.MOV(offset(dst, bld, c),
                    widths[c] ? offset(src, bld, c)
                              : fs_reg(brw_imm_ud(pad[c])));

         return dst;
      }
   }
}

namespace brw {
   namespace image_access {
      /**
       * Load a vector from a surface of the given format and dimensionality
       * at the given coordinates.  \p surf_dims and \p arr_dims give the
       * number of non-array and array coordinates of the image respectively.
       */
      fs_reg
      emit_image_load(const fs_builder &bld,
                      const fs_reg &image, const fs_reg &addr,
                      unsigned surf_dims, unsigned arr_dims,
                      unsigned gl_format)
      {
         using namespace image_format_info;
         using namespace image_format_conversion;
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         const gen_device_info *devinfo = bld.shader->devinfo;
         const isl_format format = isl_format_for_gl_format(gl_format);
         const isl_format lower_format =
            isl_lower_storage_image_format(devinfo, format);
         fs_reg tmp;

         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
            /* Hopefully we get here most of the time... */
            tmp = emit_typed_read(bld, image, saddr, dims,
                                  isl_format_get_num_channels(lower_format));
         } else {
            /* Untyped surface reads return 32 bits of the surface per
             * component, without any sort of unpacking or type conversion,
             */
            const unsigned size = isl_format_get_layout(format)->bpb / 32;
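            /* (E.g. a 64 bits-per-block format such as RG32F transfers two
             * dwords per texel, while the 32-bit formats transfer one.)
             */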
            /* they don't properly handle out of bounds access, so we have to
             * check manually if the coordinates are valid and predicate the
             * surface read on the result,
             */
            const brw_predicate pred =
               emit_untyped_image_check(bld, image,
                                        emit_bounds_check(bld, image,
                                                          saddr, dims));

            /* and they don't know about surface coordinates, we need to
             * convert them to a raw memory offset.
             */
            const fs_reg laddr = emit_address_calculation(bld, image, saddr, dims);

            tmp = emit_untyped_read(bld, image, laddr, 1, size, pred);

            /* An out of bounds surface access should give zero as result. */
            for (unsigned c = 0; c < size; ++c)
               set_predicate(pred, bld.SEL(offset(tmp, bld, c),
                                           offset(tmp, bld, c), brw_imm_d(0)));
         }

         /* Set the register type to D instead of UD if the data type is
          * represented as a signed integer in memory so that sign extension
          * is handled correctly by unpack.
          */
         if (needs_sign_extension(format))
            tmp = retype(tmp, BRW_REGISTER_TYPE_D);

         if (!has_supported_bit_layout(devinfo, format)) {
            /* Unpack individual vector components from the bitfield if the
             * hardware is unable to do it for us.
             */
            if (has_split_bit_layout(devinfo, format))
               tmp = emit_pack(bld, tmp, get_bit_shifts(lower_format),
                               get_bit_widths(lower_format));
            else
               tmp = emit_unpack(bld, tmp, get_bit_shifts(format),
                                 get_bit_widths(format));

         } else if ((needs_sign_extension(format) &&
                     !is_conversion_trivial(devinfo, format)) ||
                    has_undefined_high_bits(devinfo, format)) {
            /* Perform a trivial unpack even though the bit layout matches in
             * order to get the most significant bits of each component
             * initialized properly.
             */
            tmp = emit_unpack(bld, tmp, color_u(0, 32, 64, 96),
                              get_bit_widths(format));
         }

         if (!isl_format_has_int_channel(format)) {
            if (is_conversion_trivial(devinfo, format)) {
               /* Just need to cast the vector to the target type. */
               tmp = retype(tmp, BRW_REGISTER_TYPE_F);
            } else {
               /* Do the right sort of type conversion to float. */
               if (isl_format_has_float_channel(format))
                  tmp = emit_convert_from_float(
                     bld, tmp, get_bit_widths(format));
               else
                  tmp = emit_convert_from_scaled(
                     bld, tmp, get_bit_widths(format),
                     isl_format_has_snorm_channel(format));
            }
         }

         /* Initialize missing components of the result. */
         return emit_pad(bld, tmp, get_bit_widths(format));
      }

      /**
       * Store a vector in a surface of the given format and dimensionality at
       * the given coordinates.  \p surf_dims and \p arr_dims give the number
       * of non-array and array coordinates of the image respectively.
       */
      void
      emit_image_store(const fs_builder &bld, const fs_reg &image,
                       const fs_reg &addr, const fs_reg &src,
                       unsigned surf_dims, unsigned arr_dims,
                       unsigned gl_format)
      {
         using namespace image_format_info;
         using namespace image_format_conversion;
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         const isl_format format = isl_format_for_gl_format(gl_format);
         const gen_device_info *devinfo = bld.shader->devinfo;

         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (gl_format == GL_NONE) {
            /* We don't know what the format is, but that's fine because it
             * implies write-only access, and typed surface writes are always
             * able to take care of type conversion and packing for us.
             */
            emit_typed_write(bld, image, saddr, src, dims, 4);

         } else {
            const isl_format lower_format =
               isl_lower_storage_image_format(devinfo, format);
            fs_reg tmp = src;

            if (!is_conversion_trivial(devinfo, format)) {
               /* Do the right sort of type conversion. */
               if (isl_format_has_float_channel(format))
                  tmp = emit_convert_to_float(bld, tmp, get_bit_widths(format));

               else if (isl_format_has_int_channel(format))
                  tmp = emit_convert_to_integer(bld, tmp, get_bit_widths(format),
                                                isl_format_has_sint_channel(format));

               else
                  tmp = emit_convert_to_scaled(bld, tmp, get_bit_widths(format),
                                               isl_format_has_snorm_channel(format));
            }

            /* We're down to bit manipulation at this point. */
            tmp = retype(tmp, BRW_REGISTER_TYPE_UD);

            if (!has_supported_bit_layout(devinfo, format)) {
               /* Pack the vector components into a bitfield if the hardware
                * is unable to do it for us.
                */
               if (has_split_bit_layout(devinfo, format))
                  tmp = emit_unpack(bld, tmp, get_bit_shifts(lower_format),
                                    get_bit_widths(lower_format));

               else
                  tmp = emit_pack(bld, tmp, get_bit_shifts(format),
                                  get_bit_widths(format));
            }

            if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
               /* Hopefully we get here most of the time... */
               emit_typed_write(bld, image, saddr, tmp, dims,
                                isl_format_get_num_channels(lower_format));

            } else {
               /* Untyped surface writes store 32 bits of the surface per
                * component, without any sort of packing or type conversion,
                */
               const unsigned size = isl_format_get_layout(format)->bpb / 32;

               /* they don't properly handle out of bounds access, so we have
                * to check manually if the coordinates are valid and predicate
                * the surface write on the result,
                */
               const brw_predicate pred =
                  emit_untyped_image_check(bld, image,
                                           emit_bounds_check(bld, image,
                                                             saddr, dims));

               /* and, phew, they don't know about surface coordinates, we
                * need to convert them to a raw memory offset.
                */
               const fs_reg laddr = emit_address_calculation(
                  bld, image, saddr, dims);

               emit_untyped_write(bld, image, laddr, tmp, 1, size, pred);
            }
         }
      }

      /**
       * Perform an atomic read-modify-write operation in a surface of the
       * given dimensionality at the given coordinates.  \p surf_dims and \p
       * arr_dims give the number of non-array and array coordinates of the
       * image respectively.  Main building block of the imageAtomic GLSL
       * built-ins.
       */
      fs_reg
      emit_image_atomic(const fs_builder &bld,
                        const fs_reg &image, const fs_reg &addr,
                        const fs_reg &src0, const fs_reg &src1,
                        unsigned surf_dims, unsigned arr_dims,
                        unsigned rsize, unsigned op)
      {
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         /* Avoid performing an atomic operation on an unbound surface. */
         const brw_predicate pred = emit_typed_atomic_check(bld, image);
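         /* (On platforms that handle null surfaces correctly, pred is
          * BRW_PREDICATE_NONE, which evaluates to zero and also skips the
          * SEL fix-up at the end of this function.)
          */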
         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims,
                                   ISL_FORMAT_R32_UINT);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims,
                                  ISL_FORMAT_R32_UINT);

         /* Thankfully we can do without untyped atomics here. */
         const fs_reg tmp = emit_typed_atomic(bld, image, saddr, src0, src1,
                                              dims, rsize, op, pred);

         /* An unbound surface access should give zero as result. */
         if (rsize && pred)
            set_predicate(pred, bld.SEL(tmp, tmp, brw_imm_d(0)));

         return retype(tmp, src0.type);
      }
   }
}