/*
 * Copyright © 2013-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "isl/isl.h"
#include "brw_fs_surface_builder.h"
#include "brw_fs.h"

using namespace brw;

namespace brw {
   namespace surface_access {
      namespace {
         /**
          * Generate a logical send opcode for a surface message and return
          * the result.
          */
         fs_reg
         emit_send(const fs_builder &bld, enum opcode opcode,
                   const fs_reg &addr, const fs_reg &src, const fs_reg &surface,
                   unsigned dims, unsigned arg, unsigned rsize,
                   brw_predicate pred = BRW_PREDICATE_NONE)
         {
            /* Reduce the dynamically uniform surface index to a single
             * scalar.
             */
            const fs_reg usurface = bld.emit_uniformize(surface);
            const fs_reg srcs[] = {
               addr, src, usurface, brw_imm_ud(dims), brw_imm_ud(arg)
            };
            const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, rsize);
            fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));

            inst->size_written = rsize * dst.component_size(inst->exec_size);
            inst->predicate = pred;
            return dst;
         }
      }

      /**
       * Emit an untyped surface read opcode.  \p dims determines the number
       * of components of the address and \p size the number of components of
       * the returned value.
       */
      fs_reg
      emit_untyped_read(const fs_builder &bld,
                        const fs_reg &surface, const fs_reg &addr,
                        unsigned dims, unsigned size,
                        brw_predicate pred)
      {
         return emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
                          addr, fs_reg(), surface, dims, size, size, pred);
      }

      /**
       * Emit an untyped surface write opcode.  \p dims determines the number
       * of components of the address and \p size the number of components of
       * the argument.
       */
      void
      emit_untyped_write(const fs_builder &bld, const fs_reg &surface,
                         const fs_reg &addr, const fs_reg &src,
                         unsigned dims, unsigned size,
                         brw_predicate pred)
      {
         emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
                   addr, src, surface, dims, size, 0, pred);
      }

      /**
       * Emit an untyped surface atomic opcode.  \p dims determines the number
       * of components of the address and \p rsize the number of components of
       * the returned value (either zero or one).
       */
      fs_reg
      emit_untyped_atomic(const fs_builder &bld,
                          const fs_reg &surface, const fs_reg &addr,
                          const fs_reg &src0, const fs_reg &src1,
                          unsigned dims, unsigned rsize, unsigned op,
                          brw_predicate pred)
      {
         /* FINISHME: Factor out this frequently recurring pattern into a
          * helper function.
          */
         const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
         const fs_reg srcs[] = { src0, src1 };
         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
         bld.LOAD_PAYLOAD(tmp, srcs, n, 0);

         return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
                          addr, tmp, surface, dims, op, rsize, pred);
      }

      /**
       * Emit a typed surface read opcode.  \p dims determines the number of
       * components of the address and \p size the number of components of the
       * returned value.
       */
      fs_reg
      emit_typed_read(const fs_builder &bld, const fs_reg &surface,
                      const fs_reg &addr, unsigned dims, unsigned size)
      {
         return emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
                          addr, fs_reg(), surface, dims, size, size);
      }

      /**
       * Emit a typed surface write opcode.  \p dims determines the number of
       * components of the address and \p size the number of components of the
       * argument.
       */
      void
      emit_typed_write(const fs_builder &bld, const fs_reg &surface,
                       const fs_reg &addr, const fs_reg &src,
                       unsigned dims, unsigned size)
      {
         emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
                   addr, src, surface, dims, size, 0);
      }

      /**
       * Emit a typed surface atomic opcode.  \p dims determines the number of
       * components of the address and \p rsize the number of components of
       * the returned value (either zero or one).
       */
      fs_reg
      emit_typed_atomic(const fs_builder &bld, const fs_reg &surface,
                        const fs_reg &addr,
                        const fs_reg &src0, const fs_reg &src1,
                        unsigned dims, unsigned rsize, unsigned op,
                        brw_predicate pred)
      {
         /* FINISHME: Factor out this frequently recurring pattern into a
          * helper function.
          */
         const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
         const fs_reg srcs[] = { src0, src1 };
         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
         bld.LOAD_PAYLOAD(tmp, srcs, n, 0);

         return emit_send(bld, SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
                          addr, tmp, surface, dims, op, rsize);
      }
   }
}

namespace {
   namespace image_format_info {
      /* The higher compiler layers use the GL enums for image formats even if
       * they come in from SPIR-V or Vulkan.  We need to turn them into an ISL
       * enum before we can use them.
       */
      enum isl_format
      isl_format_for_gl_format(uint32_t gl_format)
      {
         switch (gl_format) {
         case GL_R8:             return ISL_FORMAT_R8_UNORM;
         case GL_R8_SNORM:       return ISL_FORMAT_R8_SNORM;
         case GL_R8UI:           return ISL_FORMAT_R8_UINT;
         case GL_R8I:            return ISL_FORMAT_R8_SINT;
         case GL_RG8:            return ISL_FORMAT_R8G8_UNORM;
         case GL_RG8_SNORM:      return ISL_FORMAT_R8G8_SNORM;
         case GL_RG8UI:          return ISL_FORMAT_R8G8_UINT;
         case GL_RG8I:           return ISL_FORMAT_R8G8_SINT;
         case GL_RGBA8:          return ISL_FORMAT_R8G8B8A8_UNORM;
         case GL_RGBA8_SNORM:    return ISL_FORMAT_R8G8B8A8_SNORM;
         case GL_RGBA8UI:        return ISL_FORMAT_R8G8B8A8_UINT;
         case GL_RGBA8I:         return ISL_FORMAT_R8G8B8A8_SINT;
         case GL_R11F_G11F_B10F: return ISL_FORMAT_R11G11B10_FLOAT;
         case GL_RGB10_A2:       return ISL_FORMAT_R10G10B10A2_UNORM;
         case GL_RGB10_A2UI:     return ISL_FORMAT_R10G10B10A2_UINT;
         case GL_R16:            return ISL_FORMAT_R16_UNORM;
         case GL_R16_SNORM:      return ISL_FORMAT_R16_SNORM;
         case GL_R16F:           return ISL_FORMAT_R16_FLOAT;
         case GL_R16UI:          return ISL_FORMAT_R16_UINT;
         case GL_R16I:           return ISL_FORMAT_R16_SINT;
         case GL_RG16:           return ISL_FORMAT_R16G16_UNORM;
         case GL_RG16_SNORM:     return ISL_FORMAT_R16G16_SNORM;
         case GL_RG16F:          return ISL_FORMAT_R16G16_FLOAT;
         case GL_RG16UI:         return ISL_FORMAT_R16G16_UINT;
         case GL_RG16I:          return ISL_FORMAT_R16G16_SINT;
         case GL_RGBA16:         return ISL_FORMAT_R16G16B16A16_UNORM;
         case GL_RGBA16_SNORM:   return ISL_FORMAT_R16G16B16A16_SNORM;
         case GL_RGBA16F:        return ISL_FORMAT_R16G16B16A16_FLOAT;
         case GL_RGBA16UI:       return ISL_FORMAT_R16G16B16A16_UINT;
         case GL_RGBA16I:        return ISL_FORMAT_R16G16B16A16_SINT;
         case GL_R32F:           return ISL_FORMAT_R32_FLOAT;
         case GL_R32UI:          return ISL_FORMAT_R32_UINT;
         case GL_R32I:           return ISL_FORMAT_R32_SINT;
         case GL_RG32F:          return ISL_FORMAT_R32G32_FLOAT;
         case GL_RG32UI:         return ISL_FORMAT_R32G32_UINT;
         case GL_RG32I:          return ISL_FORMAT_R32G32_SINT;
         case GL_RGBA32F:        return ISL_FORMAT_R32G32B32A32_FLOAT;
         case GL_RGBA32UI:       return ISL_FORMAT_R32G32B32A32_UINT;
         case GL_RGBA32I:        return ISL_FORMAT_R32G32B32A32_SINT;
         case GL_NONE:           return ISL_FORMAT_UNSUPPORTED;
         default:
            assert(!"Invalid image format");
            return ISL_FORMAT_UNSUPPORTED;
         }
      }

      /**
       * Simple 4-tuple of scalars used to pass around per-color component
       * values.
       */
      struct color_u {
         color_u(unsigned x = 0) : r(x), g(x), b(x), a(x)
         {
         }

         color_u(unsigned r, unsigned g, unsigned b, unsigned a) :
            r(r), g(g), b(b), a(a)
         {
         }

         unsigned
         operator[](unsigned i) const
         {
            const unsigned xs[] = { r, g, b, a };
            return xs[i];
         }

         unsigned r, g, b, a;
      };

      /**
       * Return the per-channel bitfield widths for a given image format.
       */
      inline color_u
      get_bit_widths(isl_format format)
      {
         const isl_format_layout *fmtl = isl_format_get_layout(format);

         return color_u(fmtl->channels.r.bits,
                        fmtl->channels.g.bits,
                        fmtl->channels.b.bits,
                        fmtl->channels.a.bits);
      }

      /**
       * Return the per-channel bitfield shifts for a given image format.
       */
      inline color_u
      get_bit_shifts(isl_format format)
      {
         const color_u widths = get_bit_widths(format);
         return color_u(0, widths.r, widths.r + widths.g,
                        widths.r + widths.g + widths.b);
      }
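
      /* An illustrative example (not from the original source): for
       * ISL_FORMAT_R10G10B10A2_UNORM the widths are (10, 10, 10, 2), so the
       * cumulative sums above give shifts of (0, 10, 20, 30), i.e. red in
       * bits 0-9, green in bits 10-19, blue in bits 20-29 and alpha in the
       * top two bits of the 32-bit word.
       */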

      /**
       * Return true if all present components have the same bit width.
       */
      inline bool
      is_homogeneous(isl_format format)
      {
         const color_u widths = get_bit_widths(format);
         return ((widths.g == 0 || widths.g == widths.r) &&
                 (widths.b == 0 || widths.b == widths.r) &&
                 (widths.a == 0 || widths.a == widths.r));
      }

      /**
       * Return true if the format conversion boils down to a trivial copy.
       */
      inline bool
      is_conversion_trivial(const gen_device_info *devinfo, isl_format format)
      {
         return (get_bit_widths(format).r == 32 && is_homogeneous(format)) ||
                 format == isl_lower_storage_image_format(devinfo, format);
      }

      /**
       * Return true if the hardware natively supports some format with
       * compatible bitfield layout, but possibly different data types.
       */
      inline bool
      has_supported_bit_layout(const gen_device_info *devinfo,
                               isl_format format)
      {
         const color_u widths = get_bit_widths(format);
         const color_u lower_widths = get_bit_widths(
            isl_lower_storage_image_format(devinfo, format));

         return (widths.r == lower_widths.r &&
                 widths.g == lower_widths.g &&
                 widths.b == lower_widths.b &&
                 widths.a == lower_widths.a);
      }

      /**
       * Return true if we are required to spread individual components over
       * several components of the format used by the hardware (RG32 and
       * friends implemented as RGBA16UI).
       */
      inline bool
      has_split_bit_layout(const gen_device_info *devinfo, isl_format format)
      {
         const isl_format lower_format =
            isl_lower_storage_image_format(devinfo, format);

         return (isl_format_get_num_channels(format) <
                 isl_format_get_num_channels(lower_format));
      }

      /**
       * Return true if the hardware returns garbage in the unused high bits
       * of each component.  This may happen on IVB because we rely on the
       * undocumented behavior that typed reads from surfaces of the
       * unsupported R8 and R16 formats return useful data in their least
       * significant bits.
       */
      inline bool
      has_undefined_high_bits(const gen_device_info *devinfo,
                              isl_format format)
      {
         const isl_format lower_format =
            isl_lower_storage_image_format(devinfo, format);

         return (devinfo->gen == 7 && !devinfo->is_haswell &&
                 (lower_format == ISL_FORMAT_R16_UINT ||
                  lower_format == ISL_FORMAT_R8_UINT));
      }

      /**
       * Return true if the format represents values as signed integers
       * requiring sign extension when unpacking.
       */
      inline bool
      needs_sign_extension(isl_format format)
      {
         return isl_format_has_snorm_channel(format) ||
                isl_format_has_sint_channel(format);
      }
   }

   namespace image_validity {
      /**
       * Check whether the bound image is suitable for untyped access.
       */
      brw_predicate
      emit_untyped_image_check(const fs_builder &bld, const fs_reg &image,
                               brw_predicate pred)
      {
         const gen_device_info *devinfo = bld.shader->devinfo;
         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);

         if (devinfo->gen == 7 && !devinfo->is_haswell) {
            /* Check whether the first stride component (i.e. the Bpp value)
             * is greater than four, which on Gen7 indicates that a surface of
             * type RAW has been bound for untyped access.  Reading or writing
             * to a surface of a type other than RAW using untyped surface
             * messages causes a hang on IVB and VLV.
             */
            set_predicate(pred,
                          bld.CMP(bld.null_reg_ud(), stride, brw_imm_d(4),
                                  BRW_CONDITIONAL_G));

            return BRW_PREDICATE_NORMAL;
         } else {
            /* More recent generations handle the format mismatch
             * gracefully.
             */
            return pred;
         }
      }

      /**
       * Check whether there is an image bound at the given index and write
       * the comparison result to f0.0.  Returns an appropriate predication
       * mode to use on subsequent image operations.
       */
      brw_predicate
      emit_typed_atomic_check(const fs_builder &bld, const fs_reg &image)
      {
         const gen_device_info *devinfo = bld.shader->devinfo;
         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);

         if (devinfo->gen == 7 && !devinfo->is_haswell) {
            /* Check the first component of the size field to find out if the
             * image is bound.  Necessary on IVB for typed atomics because
             * they don't seem to respect null surfaces and will happily
             * corrupt or read random memory when no image is bound.
             */
            bld.CMP(bld.null_reg_ud(),
                    retype(size, BRW_REGISTER_TYPE_UD),
                    brw_imm_d(0), BRW_CONDITIONAL_NZ);

            return BRW_PREDICATE_NORMAL;
         } else {
            /* More recent platforms implement compliant behavior when a null
             * surface is bound.
             */
            return BRW_PREDICATE_NONE;
         }
      }

      /**
       * Check whether the provided coordinates are within the image bounds
       * and write the comparison result to f0.0.  Returns an appropriate
       * predication mode to use on subsequent image operations.
       */
      brw_predicate
      emit_bounds_check(const fs_builder &bld, const fs_reg &image,
                        const fs_reg &addr, unsigned dims)
      {
         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);

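         /* Emit one comparison per coordinate component.  The first CMP is
          * unpredicated and initializes the flag register; each subsequent
          * CMP is predicated on the previous result, which effectively ANDs
          * the per-component "coordinate < size" checks together.
          */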
         for (unsigned c = 0; c < dims; ++c)
            set_predicate(c == 0 ? BRW_PREDICATE_NONE : BRW_PREDICATE_NORMAL,
                          bld.CMP(bld.null_reg_ud(),
                                  offset(retype(addr, BRW_REGISTER_TYPE_UD), bld, c),
                                  offset(size, bld, c),
                                  BRW_CONDITIONAL_L));

         return BRW_PREDICATE_NORMAL;
      }
   }

   namespace image_coordinates {
      /**
       * Return the total number of coordinates needed to address a texel of
       * the surface, which may be more than the sum of \p surf_dims and \p
       * arr_dims if padding is required.
       */
      unsigned
      num_image_coordinates(const fs_builder &bld,
                            unsigned surf_dims, unsigned arr_dims,
                            isl_format format)
      {
         /* HSW in vec4 mode and our software coordinate handling for untyped
          * reads want the array index to be at the Z component.
          */
         const bool array_index_at_z =
            format != ISL_FORMAT_UNSUPPORTED &&
            !isl_has_matching_typed_storage_image_format(
               bld.shader->devinfo, format);
         const unsigned zero_dims =
            ((surf_dims == 1 && arr_dims == 1 && array_index_at_z) ? 1 : 0);

         return surf_dims + zero_dims + arr_dims;
      }
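
      /* For example, a 1-D array image (surf_dims == 1, arr_dims == 1) whose
       * format has no matching typed storage format takes three coordinates:
       * X, a zero-padded Y, and the array index in Z.  In every other case
       * the result is simply surf_dims + arr_dims.
       */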

      /**
       * Transform image coordinates into the form expected by the
       * implementation.
       */
      fs_reg
      emit_image_coordinates(const fs_builder &bld, const fs_reg &addr,
                             unsigned surf_dims, unsigned arr_dims,
                             isl_format format)
      {
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (dims > surf_dims + arr_dims) {
            assert(surf_dims == 1 && arr_dims == 1 && dims == 3);
            /* The array index is required to be passed in as the Z component,
             * so insert a zero at the Y component to shift it to the right
             * position.
             *
             * FINISHME: Factor out this frequently recurring pattern into a
             * helper function.
             */
            const fs_reg srcs[] = { addr, brw_imm_d(0), offset(addr, bld, 1) };
            const fs_reg dst = bld.vgrf(addr.type, dims);
            bld.LOAD_PAYLOAD(dst, srcs, dims, 0);
            return dst;
         } else {
            return addr;
         }
      }

      /**
       * Calculate the offset in memory of the texel given by \p coord.
       *
       * This is meant to be used with untyped surface messages to access a
       * tiled surface, which involves taking the tiling and swizzling modes
       * of the surface into account manually, so hopefully it won't have to
       * happen very often.
       *
       * The tiling algorithm implemented here matches either the X or Y
       * tiling layouts supported by the hardware depending on the tiling
       * coefficients passed to the program as uniforms.  See Volume 1 Part 2
       * Section 4.5 "Address Tiling Function" of the IVB PRM for an in-depth
       * explanation of the hardware tiling format.
       */
      fs_reg
      emit_address_calculation(const fs_builder &bld, const fs_reg &image,
                               const fs_reg &coord, unsigned dims)
      {
         const gen_device_info *devinfo = bld.shader->devinfo;
         const fs_reg off = offset(image, bld, BRW_IMAGE_PARAM_OFFSET_OFFSET);
         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
         const fs_reg tile = offset(image, bld, BRW_IMAGE_PARAM_TILING_OFFSET);
         const fs_reg swz = offset(image, bld, BRW_IMAGE_PARAM_SWIZZLING_OFFSET);
         const fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg minor = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg major = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);

         /* Shift the coordinates by the fixed surface offset.  It may be
          * non-zero if the image is a single slice of a higher-dimensional
          * surface, or if a non-zero mipmap level of the surface is bound to
          * the pipeline.  The offset needs to be applied here rather than at
          * surface state set-up time because the desired slice-level may
          * start mid-tile, so simply shifting the surface base address
          * wouldn't give a well-formed tiled surface in the general case.
          */
         for (unsigned c = 0; c < 2; ++c)
            bld.ADD(offset(addr, bld, c), offset(off, bld, c),
                    (c < dims ?
                     offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, c) :
                     fs_reg(brw_imm_d(0))));

         /* The layout of 3-D textures in memory is sort-of like a tiling
          * format.  At each miplevel, the slices are arranged in rows of
          * 2^level slices per row.  The slice row is stored in tmp.y and
          * the slice within the row is stored in tmp.x.
          *
          * The layout of 2-D array textures and cubemaps is much simpler:
          * Depending on whether the ARYSPC_LOD0 layout is in use it will be
          * stored in memory as an array of slices, each one being a 2-D
          * arrangement of miplevels, or as a 2-D arrangement of miplevels,
          * each one being an array of slices.  In either case the separation
          * between slices of the same LOD is equal to the qpitch value
          * provided as stride.w.
          *
          * This code can be made to handle both 2-D arrays and 3-D textures
          * by passing in the miplevel as tile.z for 3-D textures and 0 in
          * tile.z for 2-D array textures.
          *
          * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface
          * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
          * of the hardware 3D texture and 2D array layouts.
          */
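
         /* As a concrete illustration (not from the original source): for a
          * 3-D texture at miplevel 2 we get tile.z == 2, so a slice index of
          * z == 11 decomposes into tmp.x == (11 & 3) == 3, the slice within
          * the row, and tmp.y == (11 >> 2) == 2, the slice row, matching the
          * BFE and SHR below.
          */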
         if (dims > 2) {
            /* Decompose z into a major (tmp.y) and a minor (tmp.x)
             * index.
             */
            bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), brw_imm_d(0),
                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2));
            bld.SHR(offset(tmp, bld, 1),
                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2),
                    offset(tile, bld, 2));

            /* Take into account the horizontal (tmp.x) and vertical (tmp.y)
             * slice offset.
             */
            for (unsigned c = 0; c < 2; ++c) {
               bld.MUL(offset(tmp, bld, c),
                       offset(stride, bld, 2 + c), offset(tmp, bld, c));
               bld.ADD(offset(addr, bld, c),
                       offset(addr, bld, c), offset(tmp, bld, c));
            }
         }

         if (dims > 1) {
            /* Calculate the major/minor x and y indices.  In order to
             * accommodate both X and Y tiling, the Y-major tiling format is
             * treated as being a bunch of narrow X-tiles placed next to each
             * other.  This means that the tile width for Y-tiling is actually
             * the width of one sub-column of the Y-major tile where each 4K
             * tile has 8 512B sub-columns.
             *
             * The major Y value is the row of tiles in which the pixel lives.
             * The major X value is the tile sub-column in which the pixel
             * lives; for X tiling, this is the same as the tile column, for Y
             * tiling, each tile has 8 sub-columns.  The minor X and Y indices
             * are the position within the sub-column.
             */
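
            /* For reference, an illustration assuming the usual Gen7 tile
             * geometry: an X tile is 512 bytes wide and 8 rows high, giving
             * tile.x == 9 and tile.y == 3, while a Y tile is 128 bytes wide
             * and 32 rows high with 16-byte-wide sub-columns, giving
             * tile.x == 4 and tile.y == 5.
             */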
            for (unsigned c = 0; c < 2; ++c) {
               /* Calculate the minor x and y indices. */
               bld.BFE(offset(minor, bld, c), offset(tile, bld, c),
                       brw_imm_d(0), offset(addr, bld, c));

               /* Calculate the major x and y indices. */
               bld.SHR(offset(major, bld, c),
                       offset(addr, bld, c), offset(tile, bld, c));
            }

            /* Calculate the texel index from the start of the tile row and
             * the vertical coordinate of the row.
             * Equivalent to:
             *   tmp.x = (major.x << tile.y << tile.x) +
             *           (minor.y << tile.x) + minor.x
             *   tmp.y = major.y << tile.y
             */
            bld.SHL(tmp, major, offset(tile, bld, 1));
            bld.ADD(tmp, tmp, offset(minor, bld, 1));
            bld.SHL(tmp, tmp, offset(tile, bld, 0));
            bld.ADD(tmp, tmp, minor);
            bld.SHL(offset(tmp, bld, 1),
                    offset(major, bld, 1), offset(tile, bld, 1));

            /* Add it to the start of the tile row. */
            bld.MUL(offset(tmp, bld, 1),
                    offset(tmp, bld, 1), offset(stride, bld, 1));
            bld.ADD(tmp, tmp, offset(tmp, bld, 1));

            /* Multiply by the Bpp value. */
            bld.MUL(dst, tmp, stride);

            if (devinfo->gen < 8 && !devinfo->is_baytrail) {
               /* Take into account the two dynamically specified shifts.
                * Both are needed to implement swizzling of X-tiled
                * surfaces.  For Y-tiled surfaces only one bit needs to be
                * XOR-ed with bit 6 of the memory address, so a swz value of
                * 0xff (actually interpreted as 31 by the hardware) will be
                * provided to cause the relevant bit of tmp.y to be zero and
                * turn the first XOR into the identity.  For linear surfaces
                * or platforms lacking address swizzling both shifts will be
                * 0xff, causing the relevant bits of both tmp.x and tmp.y to
                * be zero, which effectively disables swizzling.
                */
               for (unsigned c = 0; c < 2; ++c)
                  bld.SHR(offset(tmp, bld, c), dst, offset(swz, bld, c));

               /* XOR tmp.x and tmp.y with bit 6 of the memory address. */
               bld.XOR(tmp, tmp, offset(tmp, bld, 1));
               bld.AND(tmp, tmp, brw_imm_d(1 << 6));
               bld.XOR(dst, dst, tmp);
            }

         } else {
            /* Multiply by the Bpp/stride value.  Note that addr.y may be
             * non-zero even if the image is one-dimensional because a
             * vertical offset may have been applied above to select a
             * non-zero slice or level of a higher-dimensional texture.
             */
            bld.MUL(offset(addr, bld, 1),
                    offset(addr, bld, 1), offset(stride, bld, 1));
            bld.ADD(addr, addr, offset(addr, bld, 1));
            bld.MUL(dst, addr, stride);
         }

         return dst;
      }
   }

   namespace image_format_conversion {
      using image_format_info::color_u;

      namespace {
         /**
          * Maximum representable value in an unsigned integer with the given
          * number of bits.
          */
         inline unsigned
         scale(unsigned n)
         {
            return (1 << n) - 1;
         }
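
         /* For illustration: scale(8) == 255 and scale(10) == 1023, the
          * maximum values of an 8-bit and a 10-bit UNORM channel
          * respectively.
          */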
      }

      /**
       * Pack the vector \p src in a bitfield given the per-component bit
       * shifts and widths.  Note that bitfield components are not allowed to
       * cross 32-bit boundaries.
       */
      fs_reg
      emit_pack(const fs_builder &bld, const fs_reg &src,
                const color_u &shifts, const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         bool seen[4] = {};

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);

               /* Shift each component left to the correct bitfield position. */
               bld.SHL(tmp, offset(src, bld, c), brw_imm_ud(shifts[c] % 32));

               /* Add everything up. */
               if (seen[shifts[c] / 32]) {
                  bld.OR(offset(dst, bld, shifts[c] / 32),
                         offset(dst, bld, shifts[c] / 32), tmp);
               } else {
                  bld.MOV(offset(dst, bld, shifts[c] / 32), tmp);
                  seen[shifts[c] / 32] = true;
               }
            }
         }

         return dst;
      }
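
      /* An illustration (not from the original source): packing an RGB10_A2
       * color uses shifts (0, 10, 20, 30) and widths (10, 10, 10, 2), so all
       * four components are ORed into dst.x (every shifts[c] / 32 == 0),
       * while packing an RGBA16 color uses shifts (0, 16, 32, 48), so red
       * and green land in dst.x and blue and alpha in dst.y.
       */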

      /**
       * Unpack a vector from the bitfield \p src given the per-component bit
       * shifts and widths.  Note that bitfield components are not allowed to
       * cross 32-bit boundaries.
       */
      fs_reg
      emit_unpack(const fs_builder &bld, const fs_reg &src,
                  const color_u &shifts, const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(src.type, 4);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Shift left to discard the most significant bits. */
               bld.SHL(offset(dst, bld, c),
                       offset(src, bld, shifts[c] / 32),
                       brw_imm_ud(32 - shifts[c] % 32 - widths[c]));

               /* Shift back to the least significant bits using an arithmetic
                * shift to get sign extension on signed types.
                */
               bld.ASR(offset(dst, bld, c),
                       offset(dst, bld, c), brw_imm_ud(32 - widths[c]));
            }
         }

         return dst;
      }

      /**
       * Convert an integer vector into another integer vector of the
       * specified bit widths, properly handling overflow.
       */
      fs_reg
      emit_convert_to_integer(const fs_builder &bld, const fs_reg &src,
                              const color_u &widths, bool is_signed)
      {
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(
            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
         assert(src.type == dst.type);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Clamp to the maximum value. */
               bld.emit_minmax(offset(dst, bld, c), offset(src, bld, c),
                               brw_imm_d((int)scale(widths[c] - s)),
                               BRW_CONDITIONAL_L);

               /* Clamp to the minimum value. */
               if (is_signed)
                  bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c),
                                  brw_imm_d(-(int)scale(widths[c] - s) - 1),
                                  BRW_CONDITIONAL_GE);

               /* Mask off all but the bits we actually want.  Otherwise, if
                * we pass a negative number into the hardware when it's
                * expecting something like UINT8, it will happily clamp it to
                * +255 for us.
                */
               if (is_signed && widths[c] < 32)
                  bld.AND(offset(dst, bld, c), offset(dst, bld, c),
                          brw_imm_d(scale(widths[c])));
            }
         }

         return dst;
      }
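
      /* A worked example (illustrative): for an 8-bit signed channel the
       * code above clamps to [-scale(7) - 1, scale(7)] == [-128, 127] and
       * then ANDs with scale(8) == 0xff, so e.g. -1 ends up stored as the
       * two's-complement bit pattern 0xff.
       */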

      /**
       * Convert a normalized fixed-point vector of the specified signedness
       * and bit widths into a floating point vector.
       */
      fs_reg
      emit_convert_from_scaled(const fs_builder &bld, const fs_reg &src,
                               const color_u &widths, bool is_signed)
      {
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_F, 4);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Convert to float. */
               bld.MOV(offset(dst, bld, c), offset(src, bld, c));

               /* Divide by the normalization constants. */
               bld.MUL(offset(dst, bld, c), offset(dst, bld, c),
                       brw_imm_f(1.0f / scale(widths[c] - s)));

               /* Clamp to the minimum value. */
               if (is_signed)
                  bld.emit_minmax(offset(dst, bld, c),
                                  offset(dst, bld, c), brw_imm_f(-1.0f),
                                  BRW_CONDITIONAL_GE);
            }
         }
         return dst;
      }
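
      /* Illustrative numbers: a UNORM8 value of 255 maps to 255 * (1 / 255)
       * == 1.0, while an SNORM8 value of -128 maps to -128 / 127, which the
       * final clamp raises to -1.0 as the SNORM conversion rules require.
       */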

      /**
       * Convert a floating-point vector into a normalized fixed-point vector
       * of the specified signedness and bit widths.
       */
      fs_reg
      emit_convert_to_scaled(const fs_builder &bld, const fs_reg &src,
                             const color_u &widths, bool is_signed)
      {
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(
            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Clamp the normalized floating-point argument. */
               if (is_signed) {
                  bld.emit_minmax(offset(fdst, bld, c), offset(src, bld, c),
                                  brw_imm_f(-1.0f), BRW_CONDITIONAL_GE);

                  bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
                                  brw_imm_f(1.0f), BRW_CONDITIONAL_L);
               } else {
                  set_saturate(true, bld.MOV(offset(fdst, bld, c),
                                             offset(src, bld, c)));
               }

               /* Multiply by the normalization constants. */
               bld.MUL(offset(fdst, bld, c), offset(fdst, bld, c),
                       brw_imm_f((float)scale(widths[c] - s)));

               /* Convert to integer. */
               bld.RNDE(offset(fdst, bld, c), offset(fdst, bld, c));
               bld.MOV(offset(dst, bld, c), offset(fdst, bld, c));

               /* Mask off all but the bits we actually want.  Otherwise, if
                * we pass a negative number into the hardware when it's
                * expecting something like UINT8, it will happily clamp it to
                * +255 for us.
                */
               if (is_signed && widths[c] < 32)
                  bld.AND(offset(dst, bld, c), offset(dst, bld, c),
                          brw_imm_d(scale(widths[c])));
            }
         }

         return dst;
      }

      /**
       * Convert a floating point vector of the specified bit widths into a
       * 32-bit floating point vector.
       */
      fs_reg
      emit_convert_from_float(const fs_builder &bld, const fs_reg &src,
                              const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               bld.MOV(offset(dst, bld, c), offset(src, bld, c));

               /* Extend 10-bit and 11-bit floating point numbers to 15 bits.
                * This works because they have a 5-bit exponent just like the
                * 16-bit floating point format, and they have no sign bit.
                */
               if (widths[c] < 16)
                  bld.SHL(offset(dst, bld, c),
                          offset(dst, bld, c), brw_imm_ud(15 - widths[c]));

               /* Convert to 32-bit floating point. */
               bld.F16TO32(offset(fdst, bld, c), offset(dst, bld, c));
            }
         }

         return fdst;
      }
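
      /* To make the shift concrete (an illustration): an 11-bit float has a
       * 5-bit exponent and a 6-bit mantissa, so shifting it left by
       * 15 - 11 == 4 bits lines its exponent up with the half-float exponent
       * field and leaves the low four mantissa bits zero; F16TO32 then
       * produces the same value a native half-to-float conversion would.
       */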

      /**
       * Convert a vector into a floating point vector of the specified bit
       * widths.
       */
      fs_reg
      emit_convert_to_float(const fs_builder &bld, const fs_reg &src,
                            const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               bld.MOV(offset(fdst, bld, c), offset(src, bld, c));

               /* Clamp to the minimum value, since the 10-bit and 11-bit
                * floating point formats have no sign bit and cannot
                * represent negative numbers.
                */
               if (widths[c] < 16)
                  bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
                                  brw_imm_f(0.0f), BRW_CONDITIONAL_GE);

               /* Convert to 16-bit floating-point. */
               bld.F32TO16(offset(dst, bld, c), offset(fdst, bld, c));

               /* Discard the least significant bits to get floating point
                * numbers of the requested width.  This works because the
                * 10-bit and 11-bit floating point formats have a 5-bit
                * exponent just like the 16-bit format, and they have no sign
                * bit.
                */
               if (widths[c] < 16)
                  bld.SHR(offset(dst, bld, c), offset(dst, bld, c),
                          brw_imm_ud(15 - widths[c]));
            }
         }

         return dst;
      }

      /**
       * Fill missing components of a vector with 0, 0, 0, 1.
       */
      fs_reg
      emit_pad(const fs_builder &bld, const fs_reg &src,
               const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(src.type, 4);
         const unsigned pad[] = { 0, 0, 0, 1 };

         for (unsigned c = 0; c < 4; ++c)
            bld.MOV(offset(dst, bld, c),
                    widths[c] ? offset(src, bld, c)
                              : fs_reg(brw_imm_ud(pad[c])));

         return dst;
      }
   }
}

namespace brw {
   namespace image_access {
      /**
       * Load a vector from a surface of the given format and dimensionality
       * at the given coordinates.  \p surf_dims and \p arr_dims give the
       * number of non-array and array coordinates of the image respectively.
       */
      fs_reg
      emit_image_load(const fs_builder &bld,
                      const fs_reg &image, const fs_reg &addr,
                      unsigned surf_dims, unsigned arr_dims,
                      unsigned gl_format)
      {
         using namespace image_format_info;
         using namespace image_format_conversion;
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         const gen_device_info *devinfo = bld.shader->devinfo;
         const isl_format format = isl_format_for_gl_format(gl_format);
         const isl_format lower_format =
            isl_lower_storage_image_format(devinfo, format);
         fs_reg tmp;

         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
            /* Hopefully we get here most of the time... */
            tmp = emit_typed_read(bld, image, saddr, dims,
                                  isl_format_get_num_channels(lower_format));
         } else {
            /* Untyped surface reads return 32 bits of the surface per
             * component, without any sort of unpacking or type conversion,
             */
            const unsigned size = isl_format_get_layout(format)->bpb / 32;
            /* they don't properly handle out of bounds access, so we have to
             * check manually if the coordinates are valid and predicate the
             * surface read on the result,
             */
            const brw_predicate pred =
               emit_untyped_image_check(bld, image,
                                        emit_bounds_check(bld, image,
                                                          saddr, dims));

            /* and they don't know about surface coordinates, so we need to
             * convert them to a raw memory offset.
             */
            const fs_reg laddr = emit_address_calculation(bld, image, saddr, dims);

            tmp = emit_untyped_read(bld, image, laddr, 1, size, pred);

            /* An out of bounds surface access should give zero as result. */
            for (unsigned c = 0; c < size; ++c)
               set_predicate(pred, bld.SEL(offset(tmp, bld, c),
                                           offset(tmp, bld, c), brw_imm_d(0)));
         }

         /* Set the register type to D instead of UD if the data type is
          * represented as a signed integer in memory so that sign extension
          * is handled correctly by unpack.
          */
         if (needs_sign_extension(format))
            tmp = retype(tmp, BRW_REGISTER_TYPE_D);

         if (!has_supported_bit_layout(devinfo, format)) {
            /* Unpack individual vector components from the bitfield if the
             * hardware is unable to do it for us.
             */
            if (has_split_bit_layout(devinfo, format))
               tmp = emit_pack(bld, tmp, get_bit_shifts(lower_format),
                               get_bit_widths(lower_format));
            else
               tmp = emit_unpack(bld, tmp, get_bit_shifts(format),
                                 get_bit_widths(format));

         } else if ((needs_sign_extension(format) &&
                     !is_conversion_trivial(devinfo, format)) ||
                    has_undefined_high_bits(devinfo, format)) {
            /* Perform a trivial unpack even though the bit layout matches in
             * order to get the most significant bits of each component
             * initialized properly.
             */
            tmp = emit_unpack(bld, tmp, color_u(0, 32, 64, 96),
                              get_bit_widths(format));
         }

         if (!isl_format_has_int_channel(format)) {
            if (is_conversion_trivial(devinfo, format)) {
               /* Just need to cast the vector to the target type. */
               tmp = retype(tmp, BRW_REGISTER_TYPE_F);
            } else {
               /* Do the right sort of type conversion to float. */
               if (isl_format_has_float_channel(format))
                  tmp = emit_convert_from_float(
                     bld, tmp, get_bit_widths(format));
               else
                  tmp = emit_convert_from_scaled(
                     bld, tmp, get_bit_widths(format),
                     isl_format_has_snorm_channel(format));
            }
         }

         /* Initialize missing components of the result. */
         return emit_pad(bld, tmp, get_bit_widths(format));
      }

      /**
       * Store a vector in a surface of the given format and dimensionality at
       * the given coordinates.  \p surf_dims and \p arr_dims give the number
       * of non-array and array coordinates of the image respectively.
       */
      void
      emit_image_store(const fs_builder &bld, const fs_reg &image,
                       const fs_reg &addr, const fs_reg &src,
                       unsigned surf_dims, unsigned arr_dims,
                       unsigned gl_format)
      {
         using namespace image_format_info;
         using namespace image_format_conversion;
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         const isl_format format = isl_format_for_gl_format(gl_format);
         const gen_device_info *devinfo = bld.shader->devinfo;

         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (gl_format == GL_NONE) {
            /* We don't know what the format is, but that's fine because it
             * implies write-only access, and typed surface writes are always
             * able to take care of type conversion and packing for us.
             */
            emit_typed_write(bld, image, saddr, src, dims, 4);

         } else {
            const isl_format lower_format =
               isl_lower_storage_image_format(devinfo, format);
            fs_reg tmp = src;

            if (!is_conversion_trivial(devinfo, format)) {
               /* Do the right sort of type conversion. */
               if (isl_format_has_float_channel(format))
                  tmp = emit_convert_to_float(bld, tmp, get_bit_widths(format));

               else if (isl_format_has_int_channel(format))
                  tmp = emit_convert_to_integer(bld, tmp, get_bit_widths(format),
                                                isl_format_has_sint_channel(format));

               else
                  tmp = emit_convert_to_scaled(bld, tmp, get_bit_widths(format),
                                               isl_format_has_snorm_channel(format));
            }

            /* We're down to bit manipulation at this point. */
            tmp = retype(tmp, BRW_REGISTER_TYPE_UD);

            if (!has_supported_bit_layout(devinfo, format)) {
               /* Pack the vector components into a bitfield if the hardware
                * is unable to do it for us.
                */
               if (has_split_bit_layout(devinfo, format))
                  tmp = emit_unpack(bld, tmp, get_bit_shifts(lower_format),
                                    get_bit_widths(lower_format));

               else
                  tmp = emit_pack(bld, tmp, get_bit_shifts(format),
                                  get_bit_widths(format));
            }

            if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
               /* Hopefully we get here most of the time... */
               emit_typed_write(bld, image, saddr, tmp, dims,
                                isl_format_get_num_channels(lower_format));

            } else {
               /* Untyped surface writes store 32 bits of the surface per
                * component, without any sort of packing or type conversion,
                */
               const unsigned size = isl_format_get_layout(format)->bpb / 32;

               /* they don't properly handle out of bounds access, so we have
                * to check manually if the coordinates are valid and predicate
                * the surface write on the result,
                */
               const brw_predicate pred =
                  emit_untyped_image_check(bld, image,
                                           emit_bounds_check(bld, image,
                                                             saddr, dims));

               /* and, phew, they don't know about surface coordinates, so we
                * need to convert them to a raw memory offset.
                */
               const fs_reg laddr = emit_address_calculation(
                  bld, image, saddr, dims);

               emit_untyped_write(bld, image, laddr, tmp, 1, size, pred);
            }
         }
      }

      /**
       * Perform an atomic read-modify-write operation in a surface of the
       * given dimensionality at the given coordinates.  \p surf_dims and \p
       * arr_dims give the number of non-array and array coordinates of the
       * image respectively.  Main building block of the imageAtomic GLSL
       * built-ins.
       */
      fs_reg
      emit_image_atomic(const fs_builder &bld,
                        const fs_reg &image, const fs_reg &addr,
                        const fs_reg &src0, const fs_reg &src1,
                        unsigned surf_dims, unsigned arr_dims,
                        unsigned rsize, unsigned op)
      {
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         /* Avoid performing an atomic operation on an unbound surface. */
         const brw_predicate pred = emit_typed_atomic_check(bld, image);

         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims,
                                   ISL_FORMAT_R32_UINT);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims,
                                  ISL_FORMAT_R32_UINT);

         /* Thankfully we can do without untyped atomics here. */
         const fs_reg tmp = emit_typed_atomic(bld, image, saddr, src0, src1,
                                              dims, rsize, op, pred);

         /* An unbound surface access should give zero as result. */
         if (rsize && pred)
            set_predicate(pred, bld.SEL(tmp, tmp, brw_imm_d(0)));

         return retype(tmp, src0.type);
      }
   }
}