/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#ifndef INDEXING_UTILS_H
#define INDEXING_UTILS_H

/*
 * The functions defined in this header file use the following shorthand to
 * represent tensor related data structures.
 *
 * tidx  - ivec4 tensor indices, listed in WHCN order.
 *
 * pos   - ivec3 texel position, used to fetch from an image texture via the
 *         texelFetch(image, pos, lod) GLSL function.
 * posi  - ivec4 texel element position. It is the same as pos, except with an
 *         additional component for the index of an element within the texel.
 * lpos  - ivec3 logical position, listed in WHC order. This is a permutation
 *         of texture position based on a tensor's axis_map. lpos.x is the
 *         position component that corresponds to the tensor's width dimension,
 *         lpos.y is the position component that corresponds to the tensor's
 *         height dim, and so on.
 *
 * bufi  - int index into a GPU buffer that backs a tensor.
 * nchwi - int index into a staging buffer for a tensor. The data in the
 *         staging buffer is stored in contiguous data layout, irrespective of
 *         the tensor's strides.
 */

// Width dim index, assuming WHCN order
#define W_DIM 0
// Height dim index, assuming WHCN order
#define H_DIM 1
// Channels dim index, assuming WHCN order
#define C_DIM 2

/*
 * Fast division by 4 using bit shifting
 */
#define div4(x) (x >> 2)

/*
 * Divides input by 4, rounding up the result
 */
#define divup4(x) ((x + 3) >> 2)

/*
 * Aligns input to the next multiple of 4
 */
#define alignup4(x) ((x + 3) & -4)

/*
 * Fast modulo by 4 using bit masking
 */
#define mod4(x) (x & 3)

/*
 * Find the packed dimension of a tensor given its strides. The packed
 * dimension is the "fastest moving" dimension, which will have a stride of 1.
 */
int find_packed_dim(const ivec4 strides) {
  int packed_dim = 0;
  for (int i = 0; i <= 3; i++) {
    if (strides[i] == 1) {
      packed_dim = i;
      break;
    }
  }
  return packed_dim;
}

/*
 * Get the staging buffer indices that contain the data of the texel that
 * corresponds to the provided tensor index. Since a texel has 4 elements,
 * 4 buffer indices will be retrieved.
 */
ivec4 tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes, const int packed_dim) {
  ivec4 strides =
      ivec4(1, sizes.x, sizes.x * sizes.y, sizes.x * sizes.y * sizes.z);

  int base_i = tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z +
      tidx.w * strides.w;

  return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim];
}

/*
 * Get the buffer indices that contain the data of the texel that corresponds
 * to the provided tensor index. Since a texel has 4 elements, 4 buffer indices
 * will be retrieved.
 */
ivec4 tidx_to_4bufi(
    const ivec4 tidx,
    const ivec4 strides,
    const int packed_dim) {
  int base_i = tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z +
      tidx.w * strides.w;

  return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim];
}

ivec4 nchwi_to_tidx(const int nchwi, const ivec4 sizes) {
  return ivec4(
      nchwi % sizes.x,
      (nchwi / (sizes.x)) % sizes.y,
      (nchwi / (sizes.x * sizes.y)) % sizes.z,
      (nchwi / (sizes.x * sizes.y * sizes.z)));
}

int tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes) {
  return tidx.w * sizes.x * sizes.y * sizes.z + tidx.z * sizes.x * sizes.y +
      tidx.y * sizes.x + tidx.x;
}
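/*
 * Illustrative sketch only (not used by any shader): nchwi_to_tidx and the
 * scalar tidx_to_nchwi above are inverses for any in-bounds staging buffer
 * index, since they decompose/recompose the same contiguous NCHW layout. The
 * function below is a hypothetical usage example and should return its input
 * unchanged for 0 <= nchwi < sizes.x * sizes.y * sizes.z * sizes.w.
 */
int example_nchwi_roundtrip(const int nchwi, const ivec4 sizes) {
  // Recover the WHCN tensor index of the element at this staging buffer index
  const ivec4 tidx = nchwi_to_tidx(nchwi, sizes);
  // Map the tensor index back to its contiguous NCHW staging buffer index
  return tidx_to_nchwi(tidx, sizes);
}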
// TODO(ssjia): make this function use dim order so that it can work with any
// dim order. Currently it assumes that the dim order is contiguous, except for
// the packed dim.
ivec4 bufi_to_tidx(int bufi, const ivec4 strides, const int packed_dim) {
  ivec4 idx;
  for (int i = 3; i >= 0; i--) {
    if (i != packed_dim) {
      idx[i] = bufi / strides[i];
      bufi %= strides[i];
    }
  }
  idx[packed_dim] = bufi;
  return idx;
}

// Convenience overload of the above function, which will determine the packed
// dim from the strides automatically so it doesn't have to be passed in as a
// function argument.
ivec4 bufi_to_tidx(const int bufi, const ivec4 strides) {
  int packed_dim = find_packed_dim(strides);
  return bufi_to_tidx(bufi, strides, packed_dim);
}

int tidx_to_bufi(const ivec4 tidx, ivec4 strides) {
  return tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z +
      tidx.w * strides.w;
}

ivec4 lpos_to_tidx(
    ivec3 lpos,
    ivec4 sizes,
    const int batch_inner_dim,
    const int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);
  // Moving 1 texel along the packed dim traverses 4 tensor elements
  lpos[packed_dim] *= 4;

  ivec4 tidx = ivec4(lpos, 0);

  if (sizes.w > 1) {
    tidx.w = tidx[batch_inner_dim] / sizes[batch_inner_dim];
    tidx[batch_inner_dim] %= sizes[batch_inner_dim];
  }
  return tidx;
}

ivec3 tidx_to_lpos(
    ivec4 tidx,
    ivec4 sizes,
    const int batch_inner_dim,
    const int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);

  ivec3 lpos = tidx.xyz;

  // Adjust batch inner dim by batch index if needed
  if (sizes.w > 1) {
    lpos[batch_inner_dim] += tidx.w * sizes[batch_inner_dim];
  }

  // Fast division by 4, since moving 1 texel along the packed dim traverses 4
  // tensor elements.
  lpos[packed_dim] >>= 2;
  return lpos;
}
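/*
 * Illustrative sketch only (not used by any shader): recover the WHCN index of
 * the first element held by the texel at logical position lpos, for a
 * width-packed tensor (packed_dim == W_DIM) whose batches are concatenated
 * along the channels dim (batch_inner_dim == C_DIM). The function name and the
 * choice of packing/batch layout are assumptions made for this example only.
 */
ivec4 example_width_packed_lpos_to_tidx(const ivec3 lpos, const ivec4 sizes) {
  // lpos.x indexes texels along the width dim; the returned tidx.x is the
  // element-level width index of the texel's first element (a multiple of 4).
  return lpos_to_tidx(lpos, sizes, C_DIM, W_DIM);
}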
ivec3 tidx_to_pos(
    ivec4 tidx,
    ivec4 sizes,
    const ivec4 axis_map,
    const int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);

  ivec3 pos;
  for (int dim = 0; dim < 3; ++dim) {
    pos[axis_map[dim]] = tidx[dim];
  }

  // Adjust batch inner dim by batch index if needed
  if (sizes.w > 1) {
    pos[axis_map[axis_map.w]] += tidx.w * sizes[axis_map.w];
  }

  // Fast division by 4, since moving 1 texel along the packed dim traverses 4
  // tensor elements.
  pos[axis_map[packed_dim]] >>= 2;
  return pos;
}

ivec4 tidx_to_posi(
    ivec4 tidx,
    ivec4 sizes,
    const ivec4 axis_map,
    const int packed_dim) {
  return ivec4(
      tidx_to_pos(tidx, sizes, axis_map, packed_dim), tidx[packed_dim] % 4);
}

ivec3 lpos_to_pos(const ivec3 lpos, const ivec4 axis_map) {
  ivec3 pos;
  pos[axis_map.x] = lpos.x;
  pos[axis_map.y] = lpos.y;
  pos[axis_map.z] = lpos.z;
  return pos;
}

#ifdef USING_BUFFER
#define load_texel(buf, idx) buf[idx]
#elif defined(USING_TEXTURE2D)
#define load_texel(im, pos) texelFetch(im, pos.xy, 0)
#define load_texel_lpos(im, lpos, axis_map) \
  texelFetch(im, lpos_to_pos(lpos, axis_map).xy, 0)
#else // defined(USING_TEXTURE3D)
#define load_texel(im, pos) texelFetch(im, pos, 0)
#define load_texel_lpos(im, lpos, axis_map) \
  texelFetch(im, lpos_to_pos(lpos, axis_map), 0)
#endif

#ifdef USING_BUFFER
#define write_texel(buf, idx, texel) buf[idx] = texel
#elif defined(USING_TEXTURE2D)
#define write_texel(im, pos, texel) imageStore(im, pos.xy, texel)
#define write_texel_lpos(im, lpos, texel, axis_map) \
  imageStore(im, lpos_to_pos(lpos, axis_map).xy, texel)
#else // defined(USING_TEXTURE3D)
#define write_texel(im, pos, texel) imageStore(im, pos, texel)
#define write_texel_lpos(im, lpos, texel, axis_map) \
  imageStore(im, lpos_to_pos(lpos, axis_map), texel)
#endif

/*
 * Converts a hashed layout to an ivec4 containing the axis map data and an int
 * containing the packed dim, respectively. Each value takes up 4 bits in the
 * packed int, and values are read from the least significant half byte
 * (right-most) to the most significant half byte (left-most).
 * e.g. 0x22210, 2 -> ivec4(0, 1, 2, 2)
 * e.g. 0x11021, 1 -> ivec4(1, 2, 0, 1)
 */
#define unhash_axis_map(hash) \
  ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8) & 0xf, (hash >> 12) & 0xf)

#define unhash_packed_dim(hash) int((hash >> 16) & 0xf)

#define DEFAULT_LAYOUT 0x02210
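/*
 * Illustrative sketch only (not used by any shader): unpack a layout hash into
 * its axis map and packed dim using the macros above. With the DEFAULT_LAYOUT
 * hash 0x02210 this yields axis_map = ivec4(0, 1, 2, 2) and packed_dim = 0
 * (width-packed). The function name and the out-parameter style are
 * assumptions made for this example only.
 */
ivec4 example_unhash_layout(const int layout_hash, out int packed_dim) {
  // The fifth nibble holds the packed dim
  packed_dim = unhash_packed_dim(layout_hash);
  // The four lowest nibbles hold the axis map, least significant nibble first
  return unhash_axis_map(layout_hash);
}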
/************************
 * Deprecated Functions *
 ************************/

// The below functions and macros are in the process of being deprecated in
// favor of newer indexing functions that account for axis mapping and have
// more explicit function names and more updated terminology.

/*
 * Describes which texture axis the "batches" dimension runs along in a 4D
 * texture.
 *
 * Currently it is set to 2 since we represent batches by concatenating along
 * the channels dim, which has index 2 in (W, H, C, N) order and maps to the
 * depth dimension of a texture, which also corresponds to index 2 in (x, y, z)
 * order.
 */
#define BATCH_AXIS 2

//
// (w, h, c, n) Tensor Index <-> (x, y, z) Texture Position Conversion
//

/*
 * Input: (x, y, z) texel position, (W, H, C, N) sizes of the tensor, which dim
 *        is packed along a texel
 * Output: Whether the texel position is outside the bounds of the image
 *         texture given the size and packed dimension of the tensor.
 */
bool pos_out_of_bounds(ivec3 pos, ivec4 sizes, int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);

  ivec3 max_pos = sizes.xyz;
  max_pos[BATCH_AXIS] += sizes.w * sizes[BATCH_AXIS];
  max_pos[packed_dim] /= 4;
  return (any(greaterThanEqual(pos, max_pos)));
}

/*
 * Input: (x, y, z) texel position, (W, H, C, N) sizes of the tensor, which dim
 *        is packed along a texel
 * Returns: the (w, h, c, n) tensor index corresponding to the first element of
 *          the texel at the specified position
 */
ivec4 to_tensor_idx(ivec3 pos, ivec4 sizes, int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);

  // Packed dim contains 4 elements per texel
  pos[packed_dim] *= 4;

  // Construct the initial tensor index via swizzling
#if BATCH_AXIS == 2
  ivec4 tensor_idx = pos.xyzz;
#endif
#if BATCH_AXIS == 1
  ivec4 tensor_idx = pos.xyzy;
#endif
#if BATCH_AXIS == 0
  ivec4 tensor_idx = pos.xyzx;
#endif

  // Adjust the axis that the batch dim runs along
  tensor_idx[3] /= sizes[BATCH_AXIS];
  tensor_idx[BATCH_AXIS] %= sizes[BATCH_AXIS];

  return tensor_idx;
}

/*
 * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of a tensor, which dim
 *        is packed along a texel
 * Returns: the (x, y, z) texture position containing the element of the tensor
 *          at the specified index
 */
ivec3 to_texture_pos(ivec4 idx, ivec4 sizes, int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);

  ivec3 pos = idx.xyz;
  pos[BATCH_AXIS] += idx.w * sizes[BATCH_AXIS];
  pos[packed_dim] /= 4;
  return pos;
}

/*
 * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of the tensor, which
 *        dim is packed along a texel
 * Returns: the (x, y, z, i) texture position containing the element of the
 *          tensor at the specified index, where i is the component within the
 *          texel to which the element belongs
 */
ivec4 to_texture_elem_pos(ivec4 idx, ivec4 sizes, int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);

  // pos.w is initialized with a placeholder value and overwritten below
  ivec4 pos = idx.xyzx;
  pos[BATCH_AXIS] += idx.w * sizes[BATCH_AXIS];
  pos[packed_dim] /= 4;
  pos.w = idx[packed_dim] % 4;
  return pos;
}

//
// Miscellaneous Utility Functions and Macros
//

// Given a buffer (1-D) index cur, compute a new index where the corresponding
// tensor (N-D)'s adjacent dimensions are swapped. The parameters x, y and
// plane describe sizes. As an example, let's say we want to swap dimensions
// 0, 1 for a tensor of shape {4, 3, 2, 24} to obtain {3, 4, 2, 24}. Then,
// x = 4, y = 3 and plane = 2 * 24 = 48.
#define swap_adj_dims(cur, x, y, plane)                        \
  cur +                                                        \
      plane *                                                  \
          ((1 - y) * ((cur % (x * y * plane)) / (y * plane)) + \
           (x - 1) * ((cur % (y * plane)) / plane))

// Return the (x, y, z) position and the texel component index in a
// channels-packed 3D texture for the element at the given {n, c, h, w} index.
ivec4 get_channel_packed_pos_from_index(ivec4 nchw, ivec4 sizes) {
  int aligned_c = alignup4(sizes.y);
  int c_stride = aligned_c / 4;

  return ivec4(
      nchw.w,
      nchw.z,
      nchw.x * c_stride + nchw.y / 4,
      nchw.y % 4);
}

#endif // INDEXING_UTILS_H