//
// Copyright 2021 The ANGLE Project Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// cl_types.h: Defines common types for the OpenCL support in ANGLE.

#ifndef LIBANGLE_CLTYPES_H_
#define LIBANGLE_CLTYPES_H_

#if defined(ANGLE_ENABLE_CL)
#    include "libANGLE/CLBitField.h"
#    include "libANGLE/CLRefPointer.h"
#    include "libANGLE/Debug.h"
#    include "libANGLE/angletypes.h"

#    include "common/PackedCLEnums_autogen.h"
#    include "common/angleutils.h"

// Include frequently used standard headers
#    include <algorithm>
#    include <array>
#    include <functional>
#    include <list>
#    include <memory>
#    include <string>
#    include <utility>
#    include <vector>

namespace cl
{

class Buffer;
class CommandQueue;
class Context;
class Device;
class Event;
class Image;
class Kernel;
class Memory;
class Object;
class Platform;
class Program;
class Sampler;

using BufferPtr       = RefPointer<Buffer>;
using CommandQueuePtr = RefPointer<CommandQueue>;
using ContextPtr      = RefPointer<Context>;
using DevicePtr       = RefPointer<Device>;
using EventPtr        = RefPointer<Event>;
using KernelPtr       = RefPointer<Kernel>;
using MemoryPtr       = RefPointer<Memory>;
using PlatformPtr     = RefPointer<Platform>;
using ProgramPtr      = RefPointer<Program>;
using SamplerPtr      = RefPointer<Sampler>;

using BufferPtrs   = std::vector<BufferPtr>;
using DevicePtrs   = std::vector<DevicePtr>;
using EventPtrs    = std::vector<EventPtr>;
using KernelPtrs   = std::vector<KernelPtr>;
using MemoryPtrs   = std::vector<MemoryPtr>;
using PlatformPtrs = std::vector<PlatformPtr>;
using ProgramPtrs  = std::vector<ProgramPtr>;
using SamplerPtrs  = std::vector<SamplerPtr>;

using WorkgroupSize    = std::array<uint32_t, 3>;
using GlobalWorkOffset = std::array<uint32_t, 3>;
using GlobalWorkSize   = std::array<uint32_t, 3>;
using WorkgroupCount   = std::array<uint32_t, 3>;

template <typename T>
using EventStatusMap = std::array<T, 3>;

using Extents = angle::Extents<size_t>;
using Offset  = angle::Offset<size_t>;
constexpr Offset kOffsetZero(0, 0, 0);

struct KernelArg
{
    bool isSet;
    cl_uint index;
    size_t size;
    const void *valuePtr;
};

struct BufferRect
{
    BufferRect(const Offset &offset,
               const Extents &size,
               const size_t row_pitch,
               const size_t slice_pitch,
               const size_t element_size = 1)
        : mOrigin(offset),
          mSize(size),
          mRowPitch(row_pitch == 0 ? element_size * size.width : row_pitch),
          mSlicePitch(slice_pitch == 0 ? mRowPitch * size.height : slice_pitch),
          mElementSize(element_size)
    {}

    bool valid() const
    {
        return mSize.width != 0 && mSize.height != 0 && mSize.depth != 0 &&
               mRowPitch >= mSize.width * mElementSize &&
               mSlicePitch >= mRowPitch * mSize.height && mElementSize > 0;
    }
    bool operator==(const BufferRect &other) const
    {
        return (mOrigin == other.mOrigin && mSize == other.mSize && mRowPitch == other.mRowPitch &&
                mSlicePitch == other.mSlicePitch && mElementSize == other.mElementSize);
    }
    bool operator!=(const BufferRect &other) const { return !(*this == other); }

    size_t getRowOffset(size_t slice, size_t row) const
    {
        return ((mRowPitch * (mOrigin.y + row)) + (mOrigin.x * mElementSize)) +  // row offset
               (mSlicePitch * (mOrigin.z + slice));                              // slice offset
    }

    size_t getRowPitch() { return mRowPitch; }
    size_t getSlicePitch() { return mSlicePitch; }

    Offset mOrigin;
    Extents mSize;
    size_t mRowPitch;
    size_t mSlicePitch;
    size_t mElementSize;
};

struct ImageDescriptor
{
    MemObjectType type;
    size_t width;
    size_t height;
    size_t depth;
    size_t arraySize;
    size_t rowPitch;
    size_t slicePitch;
    cl_uint numMipLevels;
    cl_uint numSamples;

    ImageDescriptor(MemObjectType type_,
                    size_t width_,
                    size_t height_,
                    size_t depth_,
                    size_t arraySize_,
                    size_t rowPitch_,
                    size_t slicePitch_,
                    cl_uint numMipLevels_,
                    cl_uint numSamples_)
        : type(type_),
          width(width_),
          height(height_),
          depth(depth_),
          arraySize(arraySize_),
          rowPitch(rowPitch_),
          slicePitch(slicePitch_),
          numMipLevels(numMipLevels_),
          numSamples(numSamples_)
    {
        if (type == MemObjectType::Image1D || type == MemObjectType::Image1D_Array ||
            type == MemObjectType::Image1D_Buffer)
        {
            depth  = 1;
            height = 1;
        }
        if (type == MemObjectType::Image2D || type == MemObjectType::Image2D_Array)
        {
            depth = 1;
        }
        if (!(type == MemObjectType::Image1D_Array || type == MemObjectType::Image2D_Array))
        {
            arraySize = 1;
        }
    }
};

struct MemOffsets
{
    size_t x, y, z;
};
constexpr MemOffsets kMemOffsetsZero{0, 0, 0};

struct Coordinate
{
    size_t x, y, z;
};
constexpr Coordinate kCoordinateZero{0, 0, 0};

struct NDRange
{
    NDRange(cl_uint workDimensionsIn,
            const size_t *globalWorkOffsetIn,
            const size_t *globalWorkSizeIn,
            const size_t *localWorkSizeIn)
        : workDimensions(workDimensionsIn),
          globalWorkOffset({0, 0, 0}),
          globalWorkSize({1, 1, 1}),
          localWorkSize({1, 1, 1}),
          nullLocalWorkSize(localWorkSizeIn == nullptr)
    {
        for (cl_uint dim = 0; dim < workDimensionsIn; dim++)
        {
            if (globalWorkOffsetIn != nullptr)
            {
                ASSERT(!(static_cast<uint32_t>((globalWorkOffsetIn[dim] + globalWorkSizeIn[dim])) <
                         globalWorkOffsetIn[dim]));
                globalWorkOffset[dim] = static_cast<uint32_t>(globalWorkOffsetIn[dim]);
            }
            if (globalWorkSizeIn != nullptr)
            {
                ASSERT(globalWorkSizeIn[dim] <= UINT32_MAX);
                globalWorkSize[dim] = static_cast<uint32_t>(globalWorkSizeIn[dim]);
            }
            if (localWorkSizeIn != nullptr)
            {
                ASSERT(localWorkSizeIn[dim] <= UINT32_MAX);
                localWorkSize[dim] = static_cast<uint32_t>(localWorkSizeIn[dim]);
            }
        }
    }

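    // Number of work-groups launched per dimension: the global work size divided by the
    // local work size, rounded up (e.g. a global size of 19 with a local size of 8 yields
    // 3 work-groups in that dimension).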
    cl::WorkgroupCount getWorkgroupCount() const
    {
        ASSERT(localWorkSize[0] > 0 && localWorkSize[1] > 0 && localWorkSize[2] > 0);
        return cl::WorkgroupCount{rx::UnsignedCeilDivide(globalWorkSize[0], localWorkSize[0]),
                                  rx::UnsignedCeilDivide(globalWorkSize[1], localWorkSize[1]),
                                  rx::UnsignedCeilDivide(globalWorkSize[2], localWorkSize[2])};
    }

    bool isUniform() const
    {
        for (cl_uint dim = 0; dim < workDimensions; dim++)
        {
            if (globalWorkSize[dim] % localWorkSize[dim] != 0)
            {
                return false;
            }
        }
        return true;
    }

    std::vector<NDRange> createUniformRegions(
        const std::array<uint32_t, 3> maxComputeWorkGroupCount) const
    {
        std::vector<NDRange> regions;
        regions.push_back(*this);
        regions.front().globalWorkOffset = {0};
        uint32_t regionCount             = 1;
        for (uint32_t regionPos = 0; regionPos < regionCount; ++regionPos)
        {
            // "Work-group sizes could be non-uniform in multiple dimensions, potentially producing
            // work-groups of up to 4 different sizes in a 2D range and 8 different sizes in a 3D
            // range."
            // https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_API.html#_mapping_work_items_onto_an_nd_range
            ASSERT(regionPos < 8);

            for (uint32_t dim = 0; dim < workDimensions; dim++)
            {
                NDRange &region    = regions.at(regionPos);
                uint32_t remainder = region.globalWorkSize[dim] % region.localWorkSize[dim];
                if (remainder != 0)
                {
                    // Split the range along this dimension. The original range's global work size
                    // (e.g. 19) is clipped to a multiple of the local work size (e.g. 8). A new
                    // range is added for the remainder (in this example 3) where the global and
                    // local work sizes are identical to the remainder (i.e. it's also a uniform
                    // range).
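                    // Concretely: the original region keeps globalWorkSize 16 (two groups
                    // of 8), and the new region gets globalWorkOffset 16 with
                    // globalWorkSize == localWorkSize == 3.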
                    NDRange newRegion(region);
                    newRegion.globalWorkSize[dim] = newRegion.localWorkSize[dim] = remainder;
                    region.globalWorkSize[dim] = newRegion.globalWorkOffset[dim] =
                        (region.globalWorkSize[dim] - remainder);
                    regions.push_back(newRegion);
                    regionCount++;
                }
            }
        }

        // Break into uniform regions that fit into the given maxComputeWorkGroupCount (if needed)
        uint32_t limitRegionCount = 1;
        std::vector<NDRange> regionsWithinDeviceLimits;
        for (const auto &region : regions)
        {
            regionsWithinDeviceLimits.push_back(region);
            for (uint32_t regionPos = 0; regionPos < limitRegionCount; ++regionPos)
            {
                NDRange &currentRegion = regionsWithinDeviceLimits.at(regionPos);
                for (uint32_t dim = 0; dim < workDimensions; dim++)
                {
                    uint32_t maxGwsForRegion = gl::clampCast<uint32_t, uint64_t>(
                        static_cast<uint64_t>(maxComputeWorkGroupCount[dim]) *
                        static_cast<uint64_t>(currentRegion.localWorkSize[dim]));

                    if (currentRegion.globalWorkSize[dim] > maxGwsForRegion)
                    {
                        uint32_t remainderGws = currentRegion.globalWorkSize[dim] - maxGwsForRegion;
                        if (remainderGws > 0)
                        {
                            NDRange remainderRegion             = currentRegion;
                            remainderRegion.globalWorkSize[dim] = remainderGws;
                            remainderRegion.globalWorkOffset[dim] =
                                currentRegion.globalWorkOffset[dim] +
                                (currentRegion.globalWorkSize[dim] - remainderGws);
                            currentRegion.globalWorkSize[dim] = maxGwsForRegion;
                            regionsWithinDeviceLimits.push_back(remainderRegion);
                            limitRegionCount++;
                        }
                    }
                }
            }
        }
        return regionsWithinDeviceLimits;
    }

    cl_uint workDimensions;
    GlobalWorkOffset globalWorkOffset;
    GlobalWorkSize globalWorkSize;
    WorkgroupSize localWorkSize;
    bool nullLocalWorkSize{false};
};

}  // namespace cl

#endif  // ANGLE_ENABLE_CL

#endif  // LIBANGLE_CLTYPES_H_
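
// Illustrative usage sketch (comment-only; not part of the header above, and the device
// limit value is hypothetical): splitting a non-uniform 1D range of 19 work-items with a
// local work size of 8 into uniform regions.
//
//   const size_t gws[3] = {19, 1, 1};
//   const size_t lws[3] = {8, 1, 1};
//   cl::NDRange range(1u, nullptr, gws, lws);
//   // range.isUniform() is false, since 19 % 8 != 0.
//   std::vector<cl::NDRange> regions = range.createUniformRegions({65535u, 65535u, 65535u});
//   // regions[0]: globalWorkOffset[0] == 0,  globalWorkSize[0] == 16, localWorkSize[0] == 8
//   // regions[1]: globalWorkOffset[0] == 16, globalWorkSize[0] == 3,  localWorkSize[0] == 3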