1 /*
2 * Minimum CUDA compatibility definitions header
3 *
4 * Copyright (c) 2019 rcombs
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23 #ifndef COMPAT_CUDA_CUDA_RUNTIME_H
24 #define COMPAT_CUDA_CUDA_RUNTIME_H
25
26 // Common macros
27 #define __global__ __attribute__((global))
28 #define __device__ __attribute__((device))
29 #define __device_builtin__ __attribute__((device_builtin))
30 #define __align__(N) __attribute__((aligned(N)))
31 #define __inline__ __inline__ __attribute__((always_inline))
32
33 #define max(a, b) ((a) > (b) ? (a) : (b))
34 #define min(a, b) ((a) < (b) ? (a) : (b))
35 #define abs(x) ((x) < 0 ? -(x) : (x))
36
37 #define atomicAdd(a, b) (__atomic_fetch_add(a, b, __ATOMIC_SEQ_CST))
38
39 // Basic typedefs
40 typedef __device_builtin__ unsigned long long cudaTextureObject_t;
41
42 typedef struct __device_builtin__ __align__(2) uchar2
43 {
44 unsigned char x, y;
45 } uchar2;
46
47 typedef struct __device_builtin__ __align__(4) ushort2
48 {
49 unsigned short x, y;
50 } ushort2;
51
52 typedef struct __device_builtin__ __align__(8) float2
53 {
54 float x, y;
55 } float2;
56
57 typedef struct __device_builtin__ __align__(8) int2
58 {
59 int x, y;
60 } int2;
61
62 typedef struct __device_builtin__ uint3
63 {
64 unsigned int x, y, z;
65 } uint3;
66
67 typedef struct uint3 dim3;
68
69 typedef struct __device_builtin__ __align__(4) uchar4
70 {
71 unsigned char x, y, z, w;
72 } uchar4;
73
74 typedef struct __device_builtin__ __align__(8) ushort4
75 {
76 unsigned short x, y, z, w;
77 } ushort4;
78
79 typedef struct __device_builtin__ __align__(16) int4
80 {
81 int x, y, z, w;
82 } int4;
83
84 typedef struct __device_builtin__ __align__(16) float4
85 {
86 float x, y, z, w;
87 } float4;
88
89 // Accessors for special registers
90 #define GETCOMP(reg, comp) \
91 asm("mov.u32 %0, %%" #reg "." #comp ";" : "=r"(tmp)); \
92 ret.comp = tmp;
93
94 #define GET(name, reg) static inline __device__ uint3 name() {\
95 uint3 ret; \
96 unsigned tmp; \
97 GETCOMP(reg, x) \
98 GETCOMP(reg, y) \
99 GETCOMP(reg, z) \
100 return ret; \
101 }
102
GET(getBlockIdx,ctaid)103 GET(getBlockIdx, ctaid)
104 GET(getBlockDim, ntid)
105 GET(getThreadIdx, tid)
106
107 // Instead of externs for these registers, we turn access to them into calls into trivial ASM
108 #define blockIdx (getBlockIdx())
109 #define blockDim (getBlockDim())
110 #define threadIdx (getThreadIdx())
111
112 // Basic initializers (simple macros rather than inline functions)
113 #define make_int2(a, b) ((int2){.x = a, .y = b})
114 #define make_uchar2(a, b) ((uchar2){.x = a, .y = b})
115 #define make_ushort2(a, b) ((ushort2){.x = a, .y = b})
116 #define make_float2(a, b) ((float2){.x = a, .y = b})
117 #define make_int4(a, b, c, d) ((int4){.x = a, .y = b, .z = c, .w = d})
118 #define make_uchar4(a, b, c, d) ((uchar4){.x = a, .y = b, .z = c, .w = d})
119 #define make_ushort4(a, b, c, d) ((ushort4){.x = a, .y = b, .z = c, .w = d})
120 #define make_float4(a, b, c, d) ((float4){.x = a, .y = b, .z = c, .w = d})
121
122 // Conversions from the tex instruction's 4-register output to various types
123 #define TEX2D(type, ret) static inline __device__ void conv(type* out, unsigned a, unsigned b, unsigned c, unsigned d) {*out = (ret);}
124
125 TEX2D(unsigned char, a & 0xFF)
126 TEX2D(unsigned short, a & 0xFFFF)
127 TEX2D(float, a)
128 TEX2D(uchar2, make_uchar2(a & 0xFF, b & 0xFF))
129 TEX2D(ushort2, make_ushort2(a & 0xFFFF, b & 0xFFFF))
130 TEX2D(float2, make_float2(a, b))
131 TEX2D(uchar4, make_uchar4(a & 0xFF, b & 0xFF, c & 0xFF, d & 0xFF))
132 TEX2D(ushort4, make_ushort4(a & 0xFFFF, b & 0xFFFF, c & 0xFFFF, d & 0xFFFF))
133 TEX2D(float4, make_float4(a, b, c, d))
134
135 // Template calling tex instruction and converting the output to the selected type
136 template<typename T>
137 inline __device__ T tex2D(cudaTextureObject_t texObject, float x, float y)
138 {
139 T ret;
140 unsigned ret1, ret2, ret3, ret4;
141 asm("tex.2d.v4.u32.f32 {%0, %1, %2, %3}, [%4, {%5, %6}];" :
142 "=r"(ret1), "=r"(ret2), "=r"(ret3), "=r"(ret4) :
143 "l"(texObject), "f"(x), "f"(y));
144 conv(&ret, ret1, ret2, ret3, ret4);
145 return ret;
146 }
147
148 template<>
149 inline __device__ float4 tex2D<float4>(cudaTextureObject_t texObject, float x, float y)
150 {
151 float4 ret;
152 asm("tex.2d.v4.f32.f32 {%0, %1, %2, %3}, [%4, {%5, %6}];" :
153 "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) :
154 "l"(texObject), "f"(x), "f"(y));
155 return ret;
156 }
157
158 template<>
159 inline __device__ float tex2D<float>(cudaTextureObject_t texObject, float x, float y)
160 {
161 return tex2D<float4>(texObject, x, y).x;
162 }
163
164 template<>
165 inline __device__ float2 tex2D<float2>(cudaTextureObject_t texObject, float x, float y)
166 {
167 float4 ret = tex2D<float4>(texObject, x, y);
168 return make_float2(ret.x, ret.y);
169 }
170
171 // Math helper functions
floorf(float a)172 static inline __device__ float floorf(float a) { return __builtin_floorf(a); }
floor(float a)173 static inline __device__ float floor(float a) { return __builtin_floorf(a); }
floor(double a)174 static inline __device__ double floor(double a) { return __builtin_floor(a); }
ceilf(float a)175 static inline __device__ float ceilf(float a) { return __builtin_ceilf(a); }
ceil(float a)176 static inline __device__ float ceil(float a) { return __builtin_ceilf(a); }
ceil(double a)177 static inline __device__ double ceil(double a) { return __builtin_ceil(a); }
truncf(float a)178 static inline __device__ float truncf(float a) { return __builtin_truncf(a); }
trunc(float a)179 static inline __device__ float trunc(float a) { return __builtin_truncf(a); }
trunc(double a)180 static inline __device__ double trunc(double a) { return __builtin_trunc(a); }
fabsf(float a)181 static inline __device__ float fabsf(float a) { return __builtin_fabsf(a); }
fabs(float a)182 static inline __device__ float fabs(float a) { return __builtin_fabsf(a); }
fabs(double a)183 static inline __device__ double fabs(double a) { return __builtin_fabs(a); }
184
__sinf(float a)185 static inline __device__ float __sinf(float a) { return __nvvm_sin_approx_f(a); }
__cosf(float a)186 static inline __device__ float __cosf(float a) { return __nvvm_cos_approx_f(a); }
187
188 #endif /* COMPAT_CUDA_CUDA_RUNTIME_H */
189