/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#ifndef INDEXING_UTILS_H
#define INDEXING_UTILS_H

/*
 * The functions defined in this header file use the following shorthand to
 * represent tensor related data structures.
 *
 * tidx  - ivec4 tensor indices, listed in WHCN order.
 *
 * pos   - ivec3 texel position, used to fetch from an image texture via the
 *         texelFetch(image, pos, lod) GLSL function.
 * posi  - ivec4 texel element position. It is the same as pos, except with an
 *         additional component containing the index of an element within the
 *         texel.
 * lpos  - ivec3 logical position, listed in WHC order. This is a permutation of
 *         texture position based on a tensor's axis_map. lpos.x is the position
 *         component that corresponds to the tensor's width dimension, lpos.y is
 *         the position component that corresponds to the tensor's height dim,
 *         and so on.
 *
 * bufi  - int index into a GPU buffer that backs a tensor.
 * nchwi - int index into a staging buffer for a tensor. The data in the
 *         staging buffer is stored in a contiguous data layout, irrespective of
 *         the tensor's strides.
 */

// Width Dim Index, assuming WHCN order
#define W_DIM 0
// Height, assuming WHCN order
#define H_DIM 1
// Channels, assuming WHCN order
#define C_DIM 2

/*
 * Fast division by 4 using bit shifting
 */
#define div4(x) (x >> 2)

/*
 * Divides input by 4 and rounds up
 */
#define divup4(x) ((x + 3) >> 2)

/*
 * Aligns input to the next multiple of 4
 */
#define alignup4(x) ((x + 3) & -4)

/*
 * Fast modulo by 4 using bit masking
 */
#define mod4(x) (x & 3)

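// Worked examples: div4(10) == 2, divup4(10) == 3, alignup4(10) == 12, and
// mod4(10) == 2.
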
/*
 * Find the packed dimension of a tensor given its strides. The packed dimension
 * is the "fastest moving" dimension which will have a stride of 1.
 */
int find_packed_dim(const ivec4 strides) {
  int packed_dim = 0;
  for (int i = 0; i <= 3; i++) {
    if (strides[i] == 1) {
      packed_dim = i;
      break;
    }
  }
  return packed_dim;
}

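// For instance, with contiguous WHCN strides ivec4(1, W, W * H, W * H * C),
// find_packed_dim returns 0, i.e. the width dim is the packed dim.
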
/*
 * Get the staging buffer indices that contain the data of the texel that
 * corresponds to the provided tensor index. Since the texel has 4 elements,
 * 4 buffer indices will be retrieved.
 */
ivec4 tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes, const int packed_dim) {
  ivec4 strides =
      ivec4(1, sizes.x, sizes.x * sizes.y, sizes.x * sizes.y * sizes.z);

  int base_i = tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z +
      tidx.w * strides.w;

  return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim];
}

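// As an illustration: for sizes = ivec4(4, 3, 2, 1) and packed_dim = 0, the
// texel at tidx = ivec4(0, 1, 0, 0) maps to staging indices (4, 5, 6, 7).
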
/*
 * Get the buffer indices that contain the data of the texel that corresponds
 * to the provided tensor index. Since the texel has 4 elements, 4 buffer
 * indices will be retrieved.
 */
ivec4 tidx_to_4bufi(
    const ivec4 tidx,
    const ivec4 strides,
    const int packed_dim) {
  int base_i = tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z +
      tidx.w * strides.w;

  return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim];
}

ivec4 nchwi_to_tidx(const int nchwi, const ivec4 sizes) {
  return ivec4(
      nchwi % sizes.x,
      (nchwi / (sizes.x)) % sizes.y,
      (nchwi / (sizes.x * sizes.y)) % sizes.z,
      (nchwi / (sizes.x * sizes.y * sizes.z)));
}

int tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes) {
  return tidx.w * sizes.x * sizes.y * sizes.z + tidx.z * sizes.x * sizes.y +
      tidx.y * sizes.x + tidx.x;
}

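// Round-trip example: with sizes = ivec4(4, 3, 2, 1), nchwi_to_tidx(13, sizes)
// returns ivec4(1, 0, 1, 0), and tidx_to_nchwi(ivec4(1, 0, 1, 0), sizes)
// returns 13.
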
// TODO(ssjia): make this function use dim order so that it can work with any
// dim order. Currently it assumes that the dim order is contiguous, except for
// the packed dim.
ivec4 bufi_to_tidx(int bufi, const ivec4 strides, const int packed_dim) {
  ivec4 idx;
  for (int i = 3; i >= 0; i--) {
    if (i != packed_dim) {
      idx[i] = bufi / strides[i];
      bufi %= strides[i];
    }
  }
  idx[packed_dim] = bufi;
  return idx;
}

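// For example, with strides = ivec4(1, 4, 12, 24) (a contiguous W = 4, H = 3,
// C = 2, N = 1 tensor) and packed_dim = 0, bufi_to_tidx(13, strides, 0)
// returns ivec4(1, 0, 1, 0).
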
// Convenience overload of the above function, which will determine the packed
// dim from the strides automatically so it doesn't have to be passed in as a
// function argument.
ivec4 bufi_to_tidx(const int bufi, const ivec4 strides) {
  int packed_dim = find_packed_dim(strides);
  return bufi_to_tidx(bufi, strides, packed_dim);
}

int tidx_to_bufi(const ivec4 tidx, ivec4 strides) {
  return tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z +
      tidx.w * strides.w;
}

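// Convert a logical texel position to the WHCN tensor index of the first
// element contained in that texel. batch_inner_dim indicates which dim the
// batches are concatenated along when sizes.w > 1.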
ivec4 lpos_to_tidx(
    ivec3 lpos,
    ivec4 sizes,
    const int batch_inner_dim,
    const int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);
  // Moving 1 texel along the packed dim traverses 4 tensor elements
  lpos[packed_dim] *= 4;

  ivec4 tidx = ivec4(lpos, 0);

  if (sizes.w > 1) {
    tidx.w = tidx[batch_inner_dim] / sizes[batch_inner_dim];
    tidx[batch_inner_dim] %= sizes[batch_inner_dim];
  }
  return tidx;
}

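// Convert a WHCN tensor index to the logical position of the texel containing
// that element (the counterpart of lpos_to_tidx).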
ivec3 tidx_to_lpos(
    ivec4 tidx,
    ivec4 sizes,
    const int batch_inner_dim,
    const int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);

  ivec3 lpos = tidx.xyz;

  // Adjust batch inner dim by batch index if needed
  if (sizes.w > 1) {
    lpos[batch_inner_dim] += tidx.w * sizes[batch_inner_dim];
  }
  // Fast division by 4, since moving 1 texel along the packed dim traverses 4
  // tensor elements.
  lpos[packed_dim] >>= 2;
  return lpos;
}

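// Convert a WHCN tensor index to the physical (x, y, z) texture position of
// the texel containing that element, using the tensor's axis_map.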
ivec3 tidx_to_pos(
    ivec4 tidx,
    ivec4 sizes,
    const ivec4 axis_map,
    const int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);

  ivec3 pos;
  for (int dim = 0; dim < 3; ++dim) {
    pos[axis_map[dim]] = tidx[dim];
  }

  // Adjust batch inner dim by batch index if needed
  if (sizes.w > 1) {
    pos[axis_map[axis_map.w]] += tidx.w * sizes[axis_map.w];
  }
  // Fast division by 4, since moving 1 texel along the packed dim traverses 4
  // tensor elements.
  pos[axis_map[packed_dim]] >>= 2;
  return pos;
}

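// Same as tidx_to_pos, but the 4th component of the result holds the index of
// the element within its texel.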
ivec4 tidx_to_posi(
    ivec4 tidx,
    ivec4 sizes,
    const ivec4 axis_map,
    const int packed_dim) {
  return ivec4(
      tidx_to_pos(tidx, sizes, axis_map, packed_dim), tidx[packed_dim] % 4);
}

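// Convert a logical position to a physical texture position by permuting its
// components according to the axis_map.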
ivec3 lpos_to_pos(const ivec3 lpos, const ivec4 axis_map) {
  ivec3 pos;
  pos[axis_map.x] = lpos.x;
  pos[axis_map.y] = lpos.y;
  pos[axis_map.z] = lpos.z;
  return pos;
}

#ifdef USING_BUFFER
#define load_texel(buf, idx) buf[idx]
#elif defined(USING_TEXTURE2D)
#define load_texel(im, pos) texelFetch(im, pos.xy, 0)
#define load_texel_lpos(im, lpos, axis_map) \
  texelFetch(im, lpos_to_pos(lpos, axis_map).xy, 0)
#else // defined(USING_TEXTURE3D)
#define load_texel(im, pos) texelFetch(im, pos, 0)
#define load_texel_lpos(im, lpos, axis_map) \
  texelFetch(im, lpos_to_pos(lpos, axis_map), 0)
#endif

#ifdef USING_BUFFER
#define write_texel(buf, idx, texel) buf[idx] = texel
#elif defined(USING_TEXTURE2D)
#define write_texel(im, pos, texel) imageStore(im, pos.xy, texel)
#define write_texel_lpos(im, lpos, texel, axis_map) \
  imageStore(im, lpos_to_pos(lpos, axis_map).xy, texel)
#else // defined(USING_TEXTURE3D)
#define write_texel(im, pos, texel) imageStore(im, pos, texel)
#define write_texel_lpos(im, lpos, texel, axis_map) \
  imageStore(im, lpos_to_pos(lpos, axis_map), texel)
#endif

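// For example, in a shader compiled with USING_TEXTURE3D defined (and an
// illustrative image binding named t_out), write_texel_lpos(t_out, lpos,
// texel, axis_map) expands to imageStore(t_out, lpos_to_pos(lpos, axis_map),
// texel).
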
/*
 * Converts hashed layout to an ivec4 containing the axis map data and an int
 * containing the packed dim respectively. Each value takes up 4 bits in the
 * packed int, and values are read from least significant half byte (right-most)
 * to most significant half byte (left-most).
 * e.g. 0x20122, 2 -> ivec4(2, 2, 1, 0)
 * e.g. 0x11021, 1 -> ivec4(1, 2, 0, 1)
 */
#define unhash_axis_map(hash) \
  ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf))

#define unhash_packed_dim(hash) int(hash >> 16 & 0xf)

#define DEFAULT_LAYOUT 0x02210

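// For example, DEFAULT_LAYOUT (0x02210) unhashes to axis_map = ivec4(0, 1, 2, 2)
// and packed_dim = 0.
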
/************************
 * Deprecated Functions *
 ************************/

// The below functions and macros are in the process of being deprecated in
// favor of newer indexing functions that account for axis mapping and have
// more explicit function names and more up-to-date terminology.

/*
 * Describes which texture axis the "batches" dimension runs along in a 4D
 * texture.
 *
 * Currently it is set to 2 since we represent batches by concatenating along
 * the channels dim, which has index 2 in (W, H, C, N) order and maps to the
 * depth dimension of a texture, which also corresponds to index 2 in (x, y, z)
 * order.
 */
#define BATCH_AXIS 2

//
// (w, h, c, n) Tensor Index <-> (x, y, z) Texture Position Conversion
//

/*
 * Input: (x, y, z) texel position, (W, H, C, N) sizes of the tensor, which dim
 *        is packed along a texel
 * Output: Whether the texel position is outside the bounds of the image texture
 *         given the size and packed dimension of the tensor.
 */
bool pos_out_of_bounds(ivec3 pos, ivec4 sizes, int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);

  ivec3 max_pos = sizes.xyz;
  max_pos[BATCH_AXIS] += sizes.w * sizes[BATCH_AXIS];
  max_pos[packed_dim] /= 4;
  return (any(greaterThanEqual(pos, max_pos)));
}

/*
 * Input: (x, y, z) texel position, (W, H, C, N) sizes of the tensor,
 *        which dim is packed along a texel
 * Returns: the (w, h, c, n) tensor index corresponding to the first element of
 *          the texel at the specified position
 */
ivec4 to_tensor_idx(ivec3 pos, ivec4 sizes, int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);

  // Packed dim contains 4 elements per texel
  pos[packed_dim] *= 4;
  // Construct the initial tensor index via swizzling
#if BATCH_AXIS == 2
  ivec4 tensor_idx = pos.xyzz;
#endif
#if BATCH_AXIS == 1
  ivec4 tensor_idx = pos.xyzy;
#endif
#if BATCH_AXIS == 0
  ivec4 tensor_idx = pos.xyzx;
#endif
  // Adjust the axis that the batch dim runs along
  tensor_idx[3] /= sizes[BATCH_AXIS];
  tensor_idx[BATCH_AXIS] %= sizes[BATCH_AXIS];

  return tensor_idx;
}

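// Worked example: for a channels-packed tensor with sizes = ivec4(8, 6, 5, 2)
// and packed_dim = 2, the texel at pos = ivec3(3, 2, 3) holds the elements
// starting at tensor index ivec4(3, 2, 4, 1).
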
/*
 * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of a tensor, which dim
 *        is packed along a texel
 * Returns: the (x, y, z) texture position containing the element of the tensor
 *          at the specified index
 */
ivec3 to_texture_pos(ivec4 idx, ivec4 sizes, int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);

  ivec3 pos = idx.xyz;
  pos[BATCH_AXIS] += idx.w * sizes[BATCH_AXIS];
  pos[packed_dim] /= 4;
  return pos;
}

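// Continuing the example above, to_texture_pos(ivec4(3, 2, 4, 1), sizes, 2)
// maps back to pos = ivec3(3, 2, 3).
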
/*
 * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of the tensor, which dim
 *        is packed along a texel
 * Returns: the (x, y, z, i) texture position containing the element of the
 *          tensor at the specified index, where i is the component within the
 *          texel to which the element belongs
 */
ivec4 to_texture_elem_pos(ivec4 idx, ivec4 sizes, int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);

  // pos.w is initialized to a placeholder value
  ivec4 pos = idx.xyzx;
  pos[BATCH_AXIS] += idx.w * sizes[BATCH_AXIS];
  pos[packed_dim] /= 4;
  pos.w = idx[packed_dim] % 4;
  return pos;
}

//
// Miscellaneous Utility Functions and Macros
//

// Given a buffer (1-D) index cur, compute a new index where the corresponding
// tensor (N-D)'s adjacent dimensions are swapped. The parameters x, y and plane
// describe sizes. As an example, let's say we want to swap dimensions 0, 1 for
// a tensor of shape {4,3,2,24} to obtain {3,4,2,24}. Then, x=4, y=3 and
// plane=2*24=48.
#define swap_adj_dims(cur, x, y, plane)                        \
  cur +                                                        \
      plane *                                                  \
          ((1 - y) * ((cur % (x * y * plane)) / (y * plane)) + \
           (x - 1) * ((cur % (y * plane)) / plane))

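// Worked example with a smaller case: to swap the dims of a {2, 3} tensor
// (x = 2, y = 3, plane = 1), the element at flat index 3 (row 1, column 0)
// moves to index swap_adj_dims(3, 2, 3, 1) == 1.
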
// Return the (x, y, z) texture position and texel component index of the
// element at the given {n, c, h, w} index of a channel-packed tensor.
ivec4 get_channel_packed_pos_from_index(ivec4 nchw, ivec4 sizes) {
  int aligned_c = alignup4(sizes.y);
  int c_stride = aligned_c / 4;

  return ivec4(nchw.w, nchw.z, nchw.x * c_stride + nchw.y / 4, nchw.y % 4);
}

#endif // INDEXING_UTILS_H