/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#ifndef INDEXING_UTILS_H
#define INDEXING_UTILS_H

/*
 * The functions defined in this header file use the following shorthand to
 * represent tensor-related data structures.
 *
 * tidx - ivec4 tensor indices, listed in WHCN order.
 *
 * pos - ivec3 texel position, used to fetch from an image texture via the
 *       texelFetch(image, pos, lod) GLSL function.
 * posi - ivec4 texel element position. It is the same as pos, except with an
 *        additional component containing the index of the element within the
 *        texel.
 * lpos - ivec3 logical position, listed in WHC order. This is a permutation of
 *        texture position based on a tensor's axis_map. lpos.x is the position
 *        component that corresponds to the tensor's width dimension, lpos.y is
 *        the position component that corresponds to the tensor's height dim,
 *        and so on.
 *
 * bufi - int index into a GPU buffer that backs a tensor.
 * nchwi - int index into a staging buffer for a tensor. The data in the
 *         staging buffer is stored in a contiguous data layout, irrespective
 *         of the tensor's strides.
 */
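/*
 * Illustrative example of the shorthand above (a sketch, assuming a
 * width-packed tensor with sizes = (6, 4, 3, 1) in WHCN order and the default
 * axis map ivec4(0, 1, 2, 2)):
 *
 *   tidx  = (5, 2, 1, 0)  // element at w=5, h=2, c=1, n=0
 *   lpos  = (1, 2, 1)     // width is the packed dim, so lpos.x = 5 / 4
 *   pos   = (1, 2, 1)     // same as lpos since the default axis map is a no-op
 *   posi  = (1, 2, 1, 1)  // the element is component 5 % 4 = 1 of its texel
 *   nchwi = 41            // 0 * (6*4*3) + 1 * (6*4) + 2 * 6 + 5
 */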

// Width Dim Index, assuming WHCN order
#define W_DIM 0
// Height Dim Index, assuming WHCN order
#define H_DIM 1
// Channels Dim Index, assuming WHCN order
#define C_DIM 2

/*
 * Fast division by 4 using bit shifting
 */
#define div4(x) (x >> 2)

/*
 * Divides input by 4 and rounds up
 */
#define divup4(x) ((x + 3) >> 2)

/*
 * Aligns input to the next multiple of 4
 */
#define alignup4(x) ((x + 3) & -4)

/*
 * Fast modulo by 4 using bit masking
 */
#define mod4(x) (x & 3)
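// Worked examples: div4(10) == 2, divup4(10) == 3, alignup4(10) == 12,
// mod4(10) == 2.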

/*
 * Find the packed dimension of a tensor given its strides. The packed dimension
 * is the "fastest moving" dimension which will have a stride of 1.
 */
int find_packed_dim(const ivec4 strides) {
  int packed_dim = 0;
  for (int i = 0; i <= 3; i++) {
    if (strides[i] == 1) {
      packed_dim = i;
      break;
    }
  }
  return packed_dim;
}
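// e.g. find_packed_dim(ivec4(1, 6, 24, 72)) == 0 (width-packed, contiguous
// strides) and find_packed_dim(ivec4(4, 24, 1, 96)) == 2 (channels-packed).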

/*
 * Get the staging buffer indices that contain the data of the texel that
 * corresponds to the provided tensor index. Since each texel has 4 elements,
 * 4 buffer indices will be retrieved.
 */
ivec4 tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes, const int packed_dim) {
  ivec4 strides =
      ivec4(1, sizes.x, sizes.x * sizes.y, sizes.x * sizes.y * sizes.z);

  int base_i = tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z +
      tidx.w * strides.w;

  return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim];
}
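// e.g. for sizes = (6, 4, 3, 1) and packed_dim = 0, the texel whose first
// element is at tidx = (4, 2, 1, 0) maps to contiguous staging indices
// (40, 41, 42, 43), since base_i = 4 + 2*6 + 1*24 = 40 and the width stride
// is 1.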

/*
 * Get the buffer indices that contain the data of the texel that corresponds
 * to the provided tensor index. Since each texel has 4 elements, 4 buffer
 * indices will be retrieved.
 */
ivec4 tidx_to_4bufi(
    const ivec4 tidx,
    const ivec4 strides,
    const int packed_dim) {
  int base_i = tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z +
      tidx.w * strides.w;

  return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim];
}

ivec4 nchwi_to_tidx(const int nchwi, const ivec4 sizes) {
  return ivec4(
      nchwi % sizes.x,
      (nchwi / (sizes.x)) % sizes.y,
      (nchwi / (sizes.x * sizes.y)) % sizes.z,
      (nchwi / (sizes.x * sizes.y * sizes.z)));
}

int tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes) {
  return tidx.w * sizes.x * sizes.y * sizes.z + tidx.z * sizes.x * sizes.y +
      tidx.y * sizes.x + tidx.x;
}
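// e.g. for sizes = (6, 4, 3, 2): nchwi_to_tidx(41, sizes) == ivec4(5, 2, 1, 0)
// and tidx_to_nchwi(ivec4(5, 2, 1, 0), sizes) == 41, so the two functions are
// inverses of each other.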

// TODO(ssjia): make this function use dim order so that it can work with any
// dim order. Currently it assumes that the dim order is contiguous, except for
// the packed dim.
ivec4 bufi_to_tidx(int bufi, const ivec4 strides, const int packed_dim) {
  ivec4 idx;
  for (int i = 3; i >= 0; i--) {
    if (i != packed_dim) {
      idx[i] = bufi / strides[i];
      bufi %= strides[i];
    }
  }
  idx[packed_dim] = bufi;
  return idx;
}

// Convenience overload of the above function, which will determine the packed
// dim from the strides automatically so it doesn't have to be passed in as a
// function argument.
ivec4 bufi_to_tidx(const int bufi, const ivec4 strides) {
  int packed_dim = find_packed_dim(strides);
  return bufi_to_tidx(bufi, strides, packed_dim);
}

int tidx_to_bufi(const ivec4 tidx, ivec4 strides) {
  return tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z +
      tidx.w * strides.w;
}
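// e.g. for contiguous width-packed strides = (1, 6, 24, 72) and
// tidx = (5, 2, 1, 0): tidx_to_bufi(tidx, strides) == 41 and
// bufi_to_tidx(41, strides) == tidx, so the two functions round-trip.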

ivec4 lpos_to_tidx(
    ivec3 lpos,
    ivec4 sizes,
    const int batch_inner_dim,
    const int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);
  // Moving 1 texel along the packed dim traverses 4 tensor elements
  lpos[packed_dim] *= 4;

  ivec4 tidx = ivec4(lpos, 0);

  if (sizes.w > 1) {
    tidx.w = tidx[batch_inner_dim] / sizes[batch_inner_dim];
    tidx[batch_inner_dim] %= sizes[batch_inner_dim];
  }
  return tidx;
}
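// e.g. for a channels-packed tensor with sizes = (6, 4, 3, 2), where batches
// are concatenated along channels (batch_inner_dim = 2, packed_dim = 2):
// lpos_to_tidx(ivec3(5, 3, 1), sizes, 2, 2) == ivec4(5, 3, 0, 1), i.e. the
// texel at logical position (5, 3, 1) starts at w=5, h=3, c=0 of batch 1.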

ivec3 tidx_to_lpos(
    ivec4 tidx,
    ivec4 sizes,
    const int batch_inner_dim,
    const int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);

  ivec3 lpos = tidx.xyz;

  // Adjust batch inner dim by batch index if needed
  if (sizes.w > 1) {
    lpos[batch_inner_dim] += tidx.w * sizes[batch_inner_dim];
  }
  // Fast division by 4, since moving 1 texel along the packed dim traverses 4
  // tensor elements.
  lpos[packed_dim] >>= 2;
  return lpos;
}
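// Inverse of the example above: tidx_to_lpos(ivec4(5, 3, 0, 1), sizes, 2, 2)
// == ivec3(5, 3, 1) for the same sizes = (6, 4, 3, 2).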

ivec3 tidx_to_pos(
    ivec4 tidx,
    ivec4 sizes,
    const ivec4 axis_map,
    const int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);

  ivec3 pos;
  for (int dim = 0; dim < 3; ++dim) {
    pos[axis_map[dim]] = tidx[dim];
  }

  // Adjust batch inner dim by batch index if needed
  if (sizes.w > 1) {
    pos[axis_map[axis_map.w]] += tidx.w * sizes[axis_map.w];
  }
  // Fast division by 4, since moving 1 texel along the packed dim traverses 4
  // tensor elements.
  pos[axis_map[packed_dim]] >>= 2;
  return pos;
}
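// With the default axis map ivec4(0, 1, 2, 2) the result matches tidx_to_lpos:
// e.g. tidx_to_pos(ivec4(5, 3, 0, 1), ivec4(6, 4, 3, 2), ivec4(0, 1, 2, 2), 2)
// == ivec3(5, 3, 1). A non-trivial axis map permutes the output components.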

ivec4 tidx_to_posi(
    ivec4 tidx,
    ivec4 sizes,
    const ivec4 axis_map,
    const int packed_dim) {
  return ivec4(
      tidx_to_pos(tidx, sizes, axis_map, packed_dim), tidx[packed_dim] % 4);
}

ivec3 lpos_to_pos(const ivec3 lpos, const ivec4 axis_map) {
  ivec3 pos;
  pos[axis_map.x] = lpos.x;
  pos[axis_map.y] = lpos.y;
  pos[axis_map.z] = lpos.z;
  return pos;
}
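// e.g. with a permuted axis map such as ivec4(2, 1, 0, 2), the tensor's width
// dim runs along the texture's z axis, so lpos_to_pos(ivec3(5, 3, 1), axis_map)
// == ivec3(1, 3, 5). With the default axis map ivec4(0, 1, 2, 2), pos == lpos.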

#ifdef USING_BUFFER
#define load_texel(buf, idx) buf[idx]
#elif defined(USING_TEXTURE2D)
#define load_texel(im, pos) texelFetch(im, pos.xy, 0)
#define load_texel_lpos(im, lpos, axis_map) \
  texelFetch(im, lpos_to_pos(lpos, axis_map).xy, 0)
#else // defined(USING_TEXTURE3D)
#define load_texel(im, pos) texelFetch(im, pos, 0)
#define load_texel_lpos(im, lpos, axis_map) \
  texelFetch(im, lpos_to_pos(lpos, axis_map), 0)
#endif

#ifdef USING_BUFFER
#define write_texel(buf, idx, texel) buf[idx] = texel
#elif defined(USING_TEXTURE2D)
#define write_texel(im, pos, texel) imageStore(im, pos.xy, texel)
#define write_texel_lpos(im, lpos, texel, axis_map) \
  imageStore(im, lpos_to_pos(lpos, axis_map).xy, texel)
#else // defined(USING_TEXTURE3D)
#define write_texel(im, pos, texel) imageStore(im, pos, texel)
#define write_texel_lpos(im, lpos, texel, axis_map) \
  imageStore(im, lpos_to_pos(lpos, axis_map), texel)
#endif
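// Illustrative usage in a texture-backed compute shader (a sketch; t_in, t_out,
// lpos and the *_axis_map variables are placeholder names, not declarations
// made by this header):
//
//   vec4 in_texel = load_texel_lpos(t_in, lpos, in_axis_map);
//   write_texel_lpos(t_out, lpos, in_texel, out_axis_map);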

/*
 * Converts a hashed layout to an ivec4 containing the axis map data and an int
 * containing the packed dim, respectively. Each value takes up 4 bits in the
 * packed int, and values are read from the least significant half byte
 * (right-most) to the most significant half byte (left-most).
 * e.g. 0x22210 -> axis map = ivec4(0, 1, 2, 2), packed dim = 2
 * e.g. 0x11021 -> axis map = ivec4(1, 2, 0, 1), packed dim = 1
 */
#define unhash_axis_map(hash) \
  ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf))

#define unhash_packed_dim(hash) int(hash >> 16 & 0xf)

#define DEFAULT_LAYOUT 0x02210
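// e.g. unhash_axis_map(DEFAULT_LAYOUT) == ivec4(0, 1, 2, 2) and
// unhash_packed_dim(DEFAULT_LAYOUT) == 0, i.e. the default layout is
// width-packed with an identity-like axis map.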

/************************
 * Deprecated Functions *
 ************************/

// The below functions and macros are in the process of being deprecated in
// favor of newer indexing functions that account for axis mapping and have more
// explicit function names and more updated terminology.

/*
 * Describes which texture axis the "batches" dimension runs along in a 4D
 * texture.
 *
 * Currently it is set to 2 since we represent batches by concatenating along
 * the channels dim, which has index 2 in (W, H, C, N) order and maps to the
 * depth dimension of a texture, which also corresponds to index 2 in (x, y, z)
 * order.
 */
#define BATCH_AXIS 2

//
// (w, h, c, n) Tensor Index <-> (x, y, z) Texture Position Conversion
//

/*
 * Input: (x, y, z) texel position, (W, H, C, N) sizes of the tensor, which dim
 *        is packed along a texel
 * Output: Whether the texel position is outside the bounds of the image texture
 *         given the size and packed dimension of the tensor.
 */
bool pos_out_of_bounds(ivec3 pos, ivec4 sizes, int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);

  ivec3 max_pos = sizes.xyz;
  max_pos[BATCH_AXIS] += sizes.w * sizes[BATCH_AXIS];
  max_pos[packed_dim] /= 4;
  return (any(greaterThanEqual(pos, max_pos)));
}

/*
 * Input: (x, y, z) texel position, (W, H, C, N) sizes of the tensor,
 *        which dim is packed along a texel
 * Returns: the (w, h, c, n) tensor index corresponding to the first element of
 *          the texel at the specified position
 */
ivec4 to_tensor_idx(ivec3 pos, ivec4 sizes, int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);

  // Packed dim contains 4 elements per texel
  pos[packed_dim] *= 4;
  // Construct the initial tensor index via swizzling
#if BATCH_AXIS == 2
  ivec4 tensor_idx = pos.xyzz;
#endif
#if BATCH_AXIS == 1
  ivec4 tensor_idx = pos.xyzy;
#endif
#if BATCH_AXIS == 0
  ivec4 tensor_idx = pos.xyzx;
#endif
  // Adjust the axis that the batch dim runs along
  tensor_idx[3] /= sizes[BATCH_AXIS];
  tensor_idx[BATCH_AXIS] %= sizes[BATCH_AXIS];

  return tensor_idx;
}

/*
 * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of a tensor, which dim
 *        is packed along a texel
 * Returns: the (x, y, z) texture position containing the element of the tensor
 *          at the specified index
 */
ivec3 to_texture_pos(ivec4 idx, ivec4 sizes, int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);

  ivec3 pos = idx.xyz;
  pos[BATCH_AXIS] += idx.w * sizes[BATCH_AXIS];
  pos[packed_dim] /= 4;
  return pos;
}

/*
 * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of the tensor, which dim
 *        is packed along a texel
 * Returns: the (x, y, z, i) texture position containing the element of the
 *          tensor at the specified index, where i is the component within the
 *          texel to which the element belongs
 */
ivec4 to_texture_elem_pos(ivec4 idx, ivec4 sizes, int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);

  // pos.w is initialized to a placeholder value here and overwritten below
  ivec4 pos = idx.xyzx;
  pos[BATCH_AXIS] += idx.w * sizes[BATCH_AXIS];
  pos[packed_dim] /= 4;
  pos.w = idx[packed_dim] % 4;
  return pos;
}

//
// Miscellaneous Utility Functions and Macros
//

// Given a buffer (1-D) index cur, compute a new index where the corresponding
// tensor (N-D)'s adjacent dimensions are swapped. The parameters x, y and plane
// describe sizes. As an example, let's say we want to swap dimensions 0,1 for a
// tensor of shape {4,3,2,24} to obtain {3,4,2,24}. Then, x=4, y=3 and
// plane=2*24=48.
#define swap_adj_dims(cur, x, y, plane) \
  cur + \
      plane * \
          ((1 - y) * ((cur % (x * y * plane)) / (y * plane)) + \
           (x - 1) * ((cur % (y * plane)) / plane))
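// e.g. with x=4, y=3, plane=48 as above, swap_adj_dims(336, 4, 3, 48) == 288:
// the element at index (2, 1) in the first two dims of the {4,3,2,24} tensor
// (buffer index 2*144 + 1*48 = 336) moves to index (1, 2) in the {3,4,2,24}
// tensor (buffer index 1*192 + 2*48 = 288).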

// Return the x, y, z and index value of the channel-packed 3D tensor from the
// {n, c, h, w}-index.
ivec4 get_channel_packed_pos_from_index(ivec4 nchw, ivec4 sizes) {
  int aligned_c = alignup4(sizes.y);
  int c_stride = aligned_c / 4;

  return ivec4(nchw.w, nchw.z, nchw.x * c_stride + nchw.y / 4, nchw.y % 4);
}
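// e.g. assuming sizes is also given in {n, c, h, w} order: for
// sizes = (2, 6, 4, 8) and nchw = (1, 5, 3, 7), aligned_c = 8, c_stride = 2,
// and the result is ivec4(7, 3, 1 * 2 + 5 / 4, 5 % 4) == ivec4(7, 3, 3, 1).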

#endif // INDEXING_UTILS_H