1 /*
2 * Copyright © 2020 Google LLC
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /* Lowers nir_intrinsic_load_ubo() to nir_intrinsic_load_ubo_vec4() taking an
25 * offset in vec4 units. This is a fairly common mode of UBO addressing for
26 * hardware to have, and it gives NIR a chance to optimize the addressing math
27 * and CSE the loads.
28 *
29 * This pass handles lowering for loads that straddle a vec4 alignment
30 * boundary. We try to minimize the extra loads we generate for that case,
31 * and are ensured non-straddling loads with:
32 *
33 * - std140 (GLSL 1.40, GLSL ES)
34 * - Vulkan "Extended Layout" (the baseline for UBOs)
35 *
36 * but not:
37 *
38 * - GLSL 4.30's new packed mode (enabled by PIPE_CAP_LOAD_CONSTBUF) where
39 * vec3 arrays are packed tightly.
40 *
41 * - PackedDriverUniformStorage in GL (enabled by PIPE_CAP_PACKED_UNIFORMS)
42 * combined with nir_lower_uniforms_to_ubo, where values in the default
43 * uniform block are packed tightly.
44 *
45 * - Vulkan's scalarBlockLayout optional feature:
46 *
47 * "A member is defined to improperly straddle if either of the following are
48 * true:
49 *
50 * • It is a vector with total size less than or equal to 16 bytes, and has
51 * Offset decorations placing its first byte at F and its last byte at L
52 * where floor(F / 16) != floor(L / 16).
53 * • It is a vector with total size greater than 16 bytes and has its Offset
54 * decorations placing its first byte at a non-integer multiple of 16.
55 *
56 * [...]
57 *
58 * Unless the scalarBlockLayout feature is enabled on the device:
59 *
60 * • Vectors must not improperly straddle, as defined above."
61 */
62
63 #include "nir.h"
64 #include "nir_builder.h"
65
66 static bool
nir_lower_ubo_vec4_filter(const nir_instr * instr,const void * data)67 nir_lower_ubo_vec4_filter(const nir_instr *instr, const void *data)
68 {
69 if (instr->type != nir_instr_type_intrinsic)
70 return false;
71
72 return nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_ubo;
73 }
74
75 static nir_intrinsic_instr *
create_load(nir_builder * b,nir_ssa_def * block,nir_ssa_def * offset,unsigned bit_size,unsigned num_components)76 create_load(nir_builder *b, nir_ssa_def *block, nir_ssa_def *offset,
77 unsigned bit_size, unsigned num_components)
78 {
79 nir_ssa_def *def = nir_load_ubo_vec4(b, num_components, bit_size, block, offset);
80 return nir_instr_as_intrinsic(def->parent_instr);
81 }
82
83 static nir_ssa_def *
nir_lower_ubo_vec4_lower(nir_builder * b,nir_instr * instr,void * data)84 nir_lower_ubo_vec4_lower(nir_builder *b, nir_instr *instr, void *data)
85 {
86 b->cursor = nir_before_instr(instr);
87
88 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
89
90 nir_ssa_def *byte_offset = nir_ssa_for_src(b, intr->src[1], 1);
91 nir_ssa_def *vec4_offset = nir_ushr_imm(b, byte_offset, 4);
92
93 unsigned align_mul = nir_intrinsic_align_mul(intr);
94 unsigned align_offset = nir_intrinsic_align_offset(intr);
95
96 int chan_size_bytes = intr->dest.ssa.bit_size / 8;
97 int chans_per_vec4 = 16 / chan_size_bytes;
98
99 /* We don't care if someone figured out that things are aligned beyond
100 * vec4.
101 */
102 align_mul = MIN2(align_mul, 16);
103 align_offset &= 15;
104 assert(align_offset % chan_size_bytes == 0);
105
106 unsigned num_components = intr->num_components;
107 bool aligned_mul = (align_mul == 16 &&
108 align_offset + chan_size_bytes * num_components <= 16);
109 if (!aligned_mul)
110 num_components = chans_per_vec4;
111
112 nir_intrinsic_instr *load = create_load(b, intr->src[0].ssa, vec4_offset,
113 intr->dest.ssa.bit_size,
114 num_components);
115
116 nir_intrinsic_set_access(load, nir_intrinsic_access(intr));
117
118 nir_ssa_def *result = &load->dest.ssa;
119
120 int align_chan_offset = align_offset / chan_size_bytes;
121 if (aligned_mul) {
122 /* For an aligned load, just ask the backend to load from the known
123 * offset's component.
124 */
125 nir_intrinsic_set_component(load, align_chan_offset);
126 } else if (intr->num_components == 1) {
127 /* If we're loading a single component, that component alone won't
128 * straddle a vec4 boundary so we can do this with a single UBO load.
129 */
130 nir_ssa_def *component =
131 nir_iand_imm(b,
132 nir_udiv_imm(b, byte_offset, chan_size_bytes),
133 chans_per_vec4 - 1);
134
135 result = nir_vector_extract(b, result, component);
136 } else if (align_mul == 8 &&
137 align_offset + chan_size_bytes * intr->num_components <= 8) {
138 /* Special case: Loading small vectors from offset % 8 == 0 can be done
139 * with just one load and one bcsel.
140 */
141 nir_component_mask_t low_channels =
142 BITSET_MASK(intr->num_components) << (align_chan_offset);
143 nir_component_mask_t high_channels =
144 low_channels << (8 / chan_size_bytes);
145 result = nir_bcsel(b, nir_test_mask(b, byte_offset, 8),
146 nir_channels(b, result, high_channels),
147 nir_channels(b, result, low_channels));
148 } else {
149 /* General fallback case: Per-result-channel bcsel-based extraction
150 * from two separate vec4 loads.
151 */
152 assert(num_components == 4);
153 nir_ssa_def *next_vec4_offset = nir_iadd_imm(b, vec4_offset, 1);
154 nir_intrinsic_instr *next_load = create_load(b, intr->src[0].ssa, next_vec4_offset,
155 intr->dest.ssa.bit_size,
156 num_components);
157
158 nir_ssa_def *channels[NIR_MAX_VEC_COMPONENTS];
159 for (unsigned i = 0; i < intr->num_components; i++) {
160 nir_ssa_def *chan_byte_offset = nir_iadd_imm(b, byte_offset, i * chan_size_bytes);
161
162 nir_ssa_def *chan_vec4_offset = nir_ushr_imm(b, chan_byte_offset, 4);
163
164 nir_ssa_def *component =
165 nir_iand_imm(b,
166 nir_udiv_imm(b, chan_byte_offset, chan_size_bytes),
167 chans_per_vec4 - 1);
168
169 channels[i] = nir_vector_extract(b,
170 nir_bcsel(b,
171 nir_ieq(b,
172 chan_vec4_offset,
173 vec4_offset),
174 &load->dest.ssa,
175 &next_load->dest.ssa),
176 component);
177 }
178
179 result = nir_vec(b, channels, intr->num_components);
180 }
181
182 return result;
183 }
184
185 bool
nir_lower_ubo_vec4(nir_shader * shader)186 nir_lower_ubo_vec4(nir_shader *shader)
187 {
188 return nir_shader_lower_instructions(shader,
189 nir_lower_ubo_vec4_filter,
190 nir_lower_ubo_vec4_lower,
191 NULL);
192 }
193