1 /* GStreamer
2 * Copyright (C) 2010 David Schleef <ds@schleef.org>
3 * Copyright (C) 2010 Sebastian Dröge <sebastian.droege@collabora.co.uk>
4 * Copyright (C) 2019 Seungha Yang <seungha.yang@navercorp.com>
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
15 *
16 * You should have received a copy of the GNU Library General Public
17 * License along with this library; if not, write to the
18 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
19 * Boston, MA 02110-1301, USA.
20 */
21
22 /**
23 * SECTION:cudaconverter
24 * @title: GstCudaConverter
25 * @short_description: Generic video conversion using CUDA
26 *
27 * This object is used to convert video frames from one format to another.
28 * The object can perform conversion of:
29 *
30 * * video format
31 * * video colorspace
32 * * video size
33 */
34
35 /**
36 * TODO:
37 * * Add more interpolation method and make it selectable,
38 * currently default bi-linear interpolation only
39 * * Add fast-path for conversion like videoconvert
40 * * Full colorimetry and chroma-siting support
41 * * cropping, and x, y position support
42 */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
48 #include "cuda-converter.h"
49 #include "gstcudautils.h"
50 #include "gstcudaloader.h"
51 #include "gstcudanvrtc.h"
52 #include <string.h>
53
54 #define CUDA_BLOCK_X 16
55 #define CUDA_BLOCK_Y 16
56 #define DIV_UP(size,block) (((size) + ((block) - 1)) / (block))
57
58 static gboolean cuda_converter_lookup_path (GstCudaConverter * convert);
59
60 #ifndef GST_DISABLE_GST_DEBUG
61 #define GST_CAT_DEFAULT ensure_debug_category()
62 static GstDebugCategory *
ensure_debug_category(void)63 ensure_debug_category (void)
64 {
65 static gsize cat_gonce = 0;
66
67 if (g_once_init_enter (&cat_gonce)) {
68 gsize cat_done;
69
70 cat_done = (gsize) _gst_debug_category_new ("cuda-converter", 0,
71 "cuda-converter object");
72
73 g_once_init_leave (&cat_gonce, cat_done);
74 }
75
76 return (GstDebugCategory *) cat_gonce;
77 }
78 #else
79 #define ensure_debug_category()
80 #endif
81
82 #define GST_CUDA_KERNEL_FUNC "gst_cuda_kernel_func"
83
84 #define GST_CUDA_KERNEL_FUNC_TO_Y444 "gst_cuda_kernel_func_to_y444"
85
86 #define GST_CUDA_KERNEL_FUNC_Y444_TO_YUV "gst_cuda_kernel_func_y444_to_yuv"
87
88 #define GST_CUDA_KERNEL_FUNC_TO_ARGB "gst_cuda_kernel_func_to_argb"
89
90 #define GST_CUDA_KERNEL_FUNC_SCALE_RGB "gst_cuda_kernel_func_scale_rgb"
91
92 /* *INDENT-OFF* */
93 /**
94 * read_chroma:
95 * @tex1: a CUDA texture object representing a semi-planar chroma plane
96 * @tex2: dummy object
97 * @x: the x coordinate to read data from @tex1
98 * @y: the y coordinate to read data from @tex1
99 *
100 * Returns: a #ushort2 vector representing both chroma pixel values
101 */
102 static const gchar READ_CHROMA_FROM_SEMI_PLANAR[] =
103 "__device__ ushort2\n"
104 "read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, \n"
105 " float x, float y)\n"
106 "{\n"
107 " return tex2D<ushort2>(tex1, x, y);\n"
108 "}";
109
110 /**
111 * read_chroma:
112 * @tex1: a CUDA texture object representing a chroma planar plane
113 * @tex2: a CUDA texture object representing the other planar plane
114 * @x: the x coordinate to read data from @tex1 and @tex2
115 * @y: the y coordinate to read data from @tex1 and @tex2
116 *
117 * Returns: a #ushort2 vector representing both chroma pixel values
118 */
119 static const gchar READ_CHROMA_FROM_PLANAR[] =
120 "__device__ ushort2\n"
121 "read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, \n"
122 " float x, float y)\n"
123 "{\n"
124 " unsigned short u, v;\n"
125 " u = tex2D<unsigned short>(tex1, x, y);\n"
126 " v = tex2D<unsigned short>(tex2, x, y);\n"
127 " return make_ushort2(u, v);\n"
128 "}";
129
130 /**
131 * write_chroma:
132 * @dst1: a CUDA global memory pointing to a semi-planar chroma plane
133 * @dst2: dummy
134 * @u: a pixel value to write @dst1
135 * @v: a pixel value to write @dst1
136 * @x: the x coordinate to write data into @tex1
137 * @x: the y coordinate to write data into @tex1
138 * @pstride: the pixel stride of @dst1
139 * @mask: bitmask to be applied to high bitdepth plane
140 *
141 * Write @u and @v pixel value to @dst1 semi-planar plane
142 */
143 static const gchar WRITE_CHROMA_TO_SEMI_PLANAR[] =
144 "__device__ void\n"
145 "write_chroma (unsigned char *dst1, unsigned char *dst2, unsigned short u,\n"
146 " unsigned short v, int x, int y, int pstride, int stride, int mask)\n"
147 "{\n"
148 " if (OUT_DEPTH > 8) {\n"
149 " *(unsigned short *)&dst1[x * pstride + y * stride] = (u & mask);\n"
150 " *(unsigned short *)&dst1[x * pstride + 2 + y * stride] = (v & mask);\n"
151 " } else {\n"
152 " dst1[x * pstride + y * stride] = u;\n"
153 " dst1[x * pstride + 1 + y * stride] = v;\n"
154 " }\n"
155 "}";
156
157 /**
158 * write_chroma:
159 * @dst1: a CUDA global memory pointing to a planar chroma plane
160 * @dst2: a CUDA global memory pointing to a the other planar chroma plane
161 * @u: a pixel value to write @dst1
162 * @v: a pixel value to write @dst1
163 * @x: the x coordinate to write data into @tex1
164 * @x: the y coordinate to write data into @tex1
165 * @pstride: the pixel stride of @dst1
166 * @mask: bitmask to be applied to high bitdepth plane
167 *
168 * Write @u and @v pixel value into @dst1 and @dst2 planar planes
169 */
170 static const gchar WRITE_CHROMA_TO_PLANAR[] =
171 "__device__ void\n"
172 "write_chroma (unsigned char *dst1, unsigned char *dst2, unsigned short u,\n"
173 " unsigned short v, int x, int y, int pstride, int stride, int mask)\n"
174 "{\n"
175 " if (OUT_DEPTH > 8) {\n"
176 " *(unsigned short *)&dst1[x * pstride + y * stride] = (u & mask);\n"
177 " *(unsigned short *)&dst2[x * pstride + y * stride] = (v & mask);\n"
178 " } else {\n"
179 " dst1[x * pstride + y * stride] = u;\n"
180 " dst2[x * pstride + y * stride] = v;\n"
181 " }\n"
182 "}";
183
184 /* CUDA kernel source for from YUV to YUV conversion and scale */
185 static const gchar templ_YUV_TO_YUV[] =
186 "extern \"C\"{\n"
187 "__constant__ float SCALE_H = %s;\n"
188 "__constant__ float SCALE_V = %s;\n"
189 "__constant__ float CHROMA_SCALE_H = %s;\n"
190 "__constant__ float CHROMA_SCALE_V = %s;\n"
191 "__constant__ int WIDTH = %d;\n"
192 "__constant__ int HEIGHT = %d;\n"
193 "__constant__ int CHROMA_WIDTH = %d;\n"
194 "__constant__ int CHROMA_HEIGHT = %d;\n"
195 "__constant__ int IN_DEPTH = %d;\n"
196 "__constant__ int OUT_DEPTH = %d;\n"
197 "__constant__ int PSTRIDE = %d;\n"
198 "__constant__ int CHROMA_PSTRIDE = %d;\n"
199 "__constant__ int IN_SHIFT = %d;\n"
200 "__constant__ int OUT_SHIFT = %d;\n"
201 "__constant__ int MASK = %d;\n"
202 "__constant__ int SWAP_UV = %d;\n"
203 "\n"
204 "__device__ unsigned short\n"
205 "do_scale_pixel (unsigned short val) \n"
206 "{\n"
207 " unsigned int diff;\n"
208 " if (OUT_DEPTH > IN_DEPTH) {\n"
209 " diff = OUT_DEPTH - IN_DEPTH;\n"
210 " return (val << diff) | (val >> (IN_DEPTH - diff));\n"
211 " } else if (IN_DEPTH > OUT_DEPTH) {\n"
212 " return val >> (IN_DEPTH - OUT_DEPTH);\n"
213 " }\n"
214 " return val;\n"
215 "}\n"
216 "\n"
217 /* __device__ ushort2
218 * read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, float x, float y);
219 */
220 "%s\n"
221 "\n"
222 /* __device__ void
223 * write_chroma (unsigned char *dst1, unsigned char *dst2, unsigned short u,
224 * unsigned short v, int x, int y, int pstride, int stride, int mask);
225 */
226 "%s\n"
227 "\n"
228 "__global__ void\n"
229 GST_CUDA_KERNEL_FUNC
230 "(cudaTextureObject_t tex0, cudaTextureObject_t tex1, cudaTextureObject_t tex2,\n"
231 " unsigned char *dst0, unsigned char *dst1, unsigned char *dst2,\n"
232 " int stride)\n"
233 "{\n"
234 " int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
235 " int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
236 " if (x_pos < WIDTH && y_pos < HEIGHT) {\n"
237 " float src_xpos = SCALE_H * x_pos;\n"
238 " float src_ypos = SCALE_V * y_pos;\n"
239 " unsigned short y = tex2D<unsigned short>(tex0, src_xpos, src_ypos);\n"
240 " y = y >> IN_SHIFT;\n"
241 " y = do_scale_pixel (y);\n"
242 " y = y << OUT_SHIFT;\n"
243 " if (OUT_DEPTH > 8) {\n"
244 " *(unsigned short *)&dst0[x_pos * PSTRIDE + y_pos * stride] = (y & MASK);\n"
245 " } else {\n"
246 " dst0[x_pos * PSTRIDE + y_pos * stride] = y;\n"
247 " }\n"
248 " }\n"
249 " if (x_pos < CHROMA_WIDTH && y_pos < CHROMA_HEIGHT) {\n"
250 " float src_xpos = CHROMA_SCALE_H * x_pos;\n"
251 " float src_ypos = CHROMA_SCALE_V * y_pos;\n"
252 " unsigned short u, v;\n"
253 " ushort2 uv = read_chroma (tex1, tex2, src_xpos, src_ypos);\n"
254 " u = uv.x;\n"
255 " v = uv.y;\n"
256 " u = u >> IN_SHIFT;\n"
257 " v = v >> IN_SHIFT;\n"
258 " u = do_scale_pixel (u);\n"
259 " v = do_scale_pixel (v);\n"
260 " u = u << OUT_SHIFT;\n"
261 " v = v << OUT_SHIFT;\n"
262 " if (SWAP_UV) {\n"
263 " unsigned short tmp = u;\n"
264 " u = v;\n"
265 " v = tmp;\n"
266 " }\n"
267 " write_chroma (dst1,\n"
268 " dst2, u, v, x_pos, y_pos, CHROMA_PSTRIDE, stride, MASK);\n"
269 " }\n"
270 "}\n"
271 "\n"
272 "}";
273
274 /* CUDA kernel source for from YUV to RGB conversion and scale */
275 static const gchar templ_YUV_TO_RGB[] =
276 "extern \"C\"{\n"
277 "__constant__ float offset[3] = {%s, %s, %s};\n"
278 "__constant__ float rcoeff[3] = {%s, %s, %s};\n"
279 "__constant__ float gcoeff[3] = {%s, %s, %s};\n"
280 "__constant__ float bcoeff[3] = {%s, %s, %s};\n"
281 "\n"
282 "__constant__ float SCALE_H = %s;\n"
283 "__constant__ float SCALE_V = %s;\n"
284 "__constant__ float CHROMA_SCALE_H = %s;\n"
285 "__constant__ float CHROMA_SCALE_V = %s;\n"
286 "__constant__ int WIDTH = %d;\n"
287 "__constant__ int HEIGHT = %d;\n"
288 "__constant__ int CHROMA_WIDTH = %d;\n"
289 "__constant__ int CHROMA_HEIGHT = %d;\n"
290 "__constant__ int IN_DEPTH = %d;\n"
291 "__constant__ int OUT_DEPTH = %d;\n"
292 "__constant__ int PSTRIDE = %d;\n"
293 "__constant__ int CHROMA_PSTRIDE = %d;\n"
294 "__constant__ int IN_SHIFT = %d;\n"
295 "__constant__ int OUT_SHIFT = %d;\n"
296 "__constant__ int MASK = %d;\n"
297 "__constant__ int SWAP_UV = %d;\n"
298 "__constant__ int MAX_IN_VAL = %d;\n"
299 "__constant__ int R_IDX = %d;\n"
300 "__constant__ int G_IDX = %d;\n"
301 "__constant__ int B_IDX = %d;\n"
302 "__constant__ int A_IDX = %d;\n"
303 "__constant__ int X_IDX = %d;\n"
304 "\n"
305 "__device__ unsigned short\n"
306 "do_scale_pixel (unsigned short val) \n"
307 "{\n"
308 " unsigned int diff;\n"
309 " if (OUT_DEPTH > IN_DEPTH) {\n"
310 " diff = OUT_DEPTH - IN_DEPTH;\n"
311 " return (val << diff) | (val >> (IN_DEPTH - diff));\n"
312 " } else if (IN_DEPTH > OUT_DEPTH) {\n"
313 " return val >> (IN_DEPTH - OUT_DEPTH);\n"
314 " }\n"
315 " return val;\n"
316 "}\n"
317 "\n"
318 "__device__ float\n"
319 "dot(float3 val, float *coeff)\n"
320 "{\n"
321 " return val.x * coeff[0] + val.y * coeff[1] + val.z * coeff[2];\n"
322 "}\n"
323 "\n"
324 "__device__ uint3\n"
325 "yuv_to_rgb (unsigned short y, unsigned short u, unsigned short v, unsigned int max_val)\n"
326 "{\n"
327 " float3 yuv = make_float3 (y, u, v);\n"
328 " uint3 rgb;\n"
329 " rgb.x = max ((unsigned int)(dot (yuv, rcoeff) + offset[0]), 0);\n"
330 " rgb.y = max ((unsigned int)(dot (yuv, gcoeff) + offset[1]), 0);\n"
331 " rgb.z = max ((unsigned int)(dot (yuv, bcoeff) + offset[2]), 0);\n"
332 " rgb.x = min (rgb.x, max_val);\n"
333 " rgb.y = min (rgb.y, max_val);\n"
334 " rgb.z = min (rgb.z, max_val);\n"
335 " return rgb;\n"
336 "}\n"
337 "\n"
338 /* __device__ ushort2
339 * read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, float x, float y);
340 */
341 "%s\n"
342 "\n"
343 "__global__ void\n"
344 GST_CUDA_KERNEL_FUNC
345 "(cudaTextureObject_t tex0, cudaTextureObject_t tex1, cudaTextureObject_t tex2,\n"
346 " unsigned char *dstRGB, int stride)\n"
347 "{\n"
348 " int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
349 " int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
350 " if (x_pos < WIDTH && y_pos < HEIGHT) {\n"
351 " float src_xpos = SCALE_H * x_pos;\n"
352 " float src_ypos = SCALE_V * y_pos;\n"
353 " unsigned short y = tex2D<unsigned short>(tex0, src_xpos, src_ypos);\n"
354 " ushort2 uv;\n"
355 " unsigned short u, v;\n"
356 " uint3 rgb;\n"
357 " unsigned int clip_max = MAX_IN_VAL;\n"
358 " src_xpos = CHROMA_SCALE_H * x_pos;\n"
359 " src_ypos = CHROMA_SCALE_V * y_pos;\n"
360 " uv = read_chroma (tex1, tex2, src_xpos, src_ypos);\n"
361 " u = uv.x;\n"
362 " v = uv.y;\n"
363 " y = y >> IN_SHIFT;\n"
364 " u = u >> IN_SHIFT;\n"
365 " v = v >> IN_SHIFT;\n"
366 " if (SWAP_UV) {\n"
367 " unsigned short tmp = u;\n"
368 " u = v;\n"
369 " v = tmp;\n"
370 " }\n"
371 /* conversion matrix is scaled to higher bitdepth between in/out formats */
372 " if (OUT_DEPTH > IN_DEPTH) {\n"
373 " y = do_scale_pixel (y);\n"
374 " u = do_scale_pixel (u);\n"
375 " v = do_scale_pixel (v);\n"
376 " clip_max = MASK;\n"
377 " }"
378 " rgb = yuv_to_rgb (y, u, v, clip_max);\n"
379 " if (OUT_DEPTH < IN_DEPTH) {\n"
380 " rgb.x = do_scale_pixel (rgb.x);\n"
381 " rgb.y = do_scale_pixel (rgb.y);\n"
382 " rgb.z = do_scale_pixel (rgb.z);\n"
383 " }"
384 " if (OUT_DEPTH > 8) {\n"
385 " unsigned int packed_rgb = 0;\n"
386 /* A is always MSB, we support only little endian system */
387 " packed_rgb = 0xc000 << 16;\n"
388 " packed_rgb |= (rgb.x << (30 - (R_IDX * 10)));\n"
389 " packed_rgb |= (rgb.y << (30 - (G_IDX * 10)));\n"
390 " packed_rgb |= (rgb.z << (30 - (B_IDX * 10)));\n"
391 " *(unsigned int *)&dstRGB[x_pos * PSTRIDE + y_pos * stride] = packed_rgb;\n"
392 " } else {\n"
393 " dstRGB[x_pos * PSTRIDE + R_IDX + y_pos * stride] = (unsigned char) rgb.x;\n"
394 " dstRGB[x_pos * PSTRIDE + G_IDX + y_pos * stride] = (unsigned char) rgb.y;\n"
395 " dstRGB[x_pos * PSTRIDE + B_IDX + y_pos * stride] = (unsigned char) rgb.z;\n"
396 " if (A_IDX >= 0 || X_IDX >= 0)\n"
397 " dstRGB[x_pos * PSTRIDE + A_IDX + y_pos * stride] = 0xff;\n"
398 " }\n"
399 " }\n"
400 "}\n"
401 "\n"
402 "}";
403
404 /**
405 * GST_CUDA_KERNEL_FUNC_TO_ARGB:
406 * @srcRGB: a CUDA global memory containing a RGB image
407 * @dstRGB: a CUDA global memory to store unpacked ARGB image
408 * @width: the width of @srcRGB and @dstRGB
409 * @height: the height of @srcRGB and @dstRGB
410 * @src_stride: the stride of @srcRGB
411 * @src_pstride: the pixel stride of @srcRGB
412 * @dst_stride: the stride of @dstRGB
413 * @r_idx: the index of red component of @srcRGB
414 * @g_idx: the index of green component of @srcRGB
415 * @b_idx: the index of blue component of @srcRGB
416 * @a_idx: the index of alpha component of @srcRGB
417 *
418 * Unpack a RGB image from @srcRGB and write the unpacked data into @dstRGB
419 */
420 static const gchar unpack_to_ARGB[] =
421 "__global__ void\n"
422 GST_CUDA_KERNEL_FUNC_TO_ARGB
423 "(unsigned char *srcRGB, unsigned char *dstRGB, int width, int height,\n"
424 " int src_stride, int src_pstride, int dst_stride,\n"
425 " int r_idx, int g_idx, int b_idx, int a_idx)\n"
426 "{\n"
427 " int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
428 " int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
429 " if (x_pos < width && y_pos < height) {\n"
430 " if (a_idx >= 0) {\n"
431 " dstRGB[x_pos * 4 + y_pos * dst_stride] =\n"
432 " srcRGB[x_pos * src_pstride + a_idx + y_pos * src_stride];\n"
433 " } else {\n"
434 " dstRGB[x_pos * 4 + y_pos * dst_stride] = 0xff;\n"
435 " }\n"
436 " dstRGB[x_pos * 4 + 1 + y_pos * dst_stride] =\n"
437 " srcRGB[x_pos * src_pstride + r_idx + y_pos * src_stride];\n"
438 " dstRGB[x_pos * 4 + 2 + y_pos * dst_stride] =\n"
439 " srcRGB[x_pos * src_pstride + g_idx + y_pos * src_stride];\n"
440 " dstRGB[x_pos * 4 + 3 + y_pos * dst_stride] =\n"
441 " srcRGB[x_pos * src_pstride + b_idx + y_pos * src_stride];\n"
442 " }\n"
443 "}\n";
444
445 /**
446 * GST_CUDA_KERNEL_FUNC_TO_ARGB:
447 * @srcRGB: a CUDA global memory containing a RGB image
448 * @dstRGB: a CUDA global memory to store unpacked ARGB64 image
449 * @width: the width of @srcRGB and @dstRGB
450 * @height: the height of @srcRGB and @dstRGB
451 * @src_stride: the stride of @srcRGB
452 * @src_pstride: the pixel stride of @srcRGB
453 * @dst_stride: the stride of @dstRGB
454 * @r_idx: the index of red component of @srcRGB
455 * @g_idx: the index of green component of @srcRGB
456 * @b_idx: the index of blue component of @srcRGB
457 * @a_idx: the index of alpha component of @srcRGB
458 *
459 * Unpack a RGB image from @srcRGB and write the unpacked data into @dstRGB
460 */
461 static const gchar unpack_to_ARGB64[] =
462 "__global__ void\n"
463 GST_CUDA_KERNEL_FUNC_TO_ARGB
464 "(unsigned char *srcRGB, unsigned char *dstRGB, int width, int height,\n"
465 " int src_stride, int src_pstride, int dst_stride,\n"
466 " int r_idx, int g_idx, int b_idx, int a_idx)\n"
467 "{\n"
468 " int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
469 " int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
470 " if (x_pos < width && y_pos < height) {\n"
471 " unsigned short a, r, g, b;\n"
472 " unsigned int read_val;\n"
473 " read_val = *(unsigned int *)&srcRGB[x_pos * src_pstride + y_pos * src_stride];\n"
474 " a = (read_val >> 30) & 0x03;\n"
475 " a = (a << 14) | (a << 12) | (a << 10) | (a << 8) | (a << 6) | (a << 4) | (a << 2) | (a << 0);\n"
476 " r = ((read_val >> (30 - (r_idx * 10))) & 0x3ff);\n"
477 " r = (r << 6) | (r >> 4);\n"
478 " g = ((read_val >> (30 - (g_idx * 10))) & 0x3ff);\n"
479 " g = (g << 6) | (g >> 4);\n"
480 " b = ((read_val >> (30 - (b_idx * 10))) & 0x3ff);\n"
481 " b = (b << 6) | (b >> 4);\n"
482 " *(unsigned short *)&dstRGB[x_pos * 8 + y_pos * dst_stride] = 0xffff;\n"
483 " *(unsigned short *)&dstRGB[x_pos * 8 + 2 + y_pos * dst_stride] = r;\n"
484 " *(unsigned short *)&dstRGB[x_pos * 8 + 4 + y_pos * dst_stride] = g;\n"
485 " *(unsigned short *)&dstRGB[x_pos * 8 + 6 + y_pos * dst_stride] = b;\n"
486 " }\n"
487 "}\n";
488
489 /* CUDA kernel source for from RGB to YUV conversion and scale */
490 static const gchar templ_RGB_TO_YUV[] =
491 "extern \"C\"{\n"
492 "__constant__ float offset[3] = {%s, %s, %s};\n"
493 "__constant__ float ycoeff[3] = {%s, %s, %s};\n"
494 "__constant__ float ucoeff[3] = {%s, %s, %s};\n"
495 "__constant__ float vcoeff[3] = {%s, %s, %s};\n"
496 "\n"
497 "__constant__ float SCALE_H = %s;\n"
498 "__constant__ float SCALE_V = %s;\n"
499 "__constant__ float CHROMA_SCALE_H = %s;\n"
500 "__constant__ float CHROMA_SCALE_V = %s;\n"
501 "__constant__ int WIDTH = %d;\n"
502 "__constant__ int HEIGHT = %d;\n"
503 "__constant__ int CHROMA_WIDTH = %d;\n"
504 "__constant__ int CHROMA_HEIGHT = %d;\n"
505 "__constant__ int IN_DEPTH = %d;\n"
506 "__constant__ int OUT_DEPTH = %d;\n"
507 "__constant__ int PSTRIDE = %d;\n"
508 "__constant__ int CHROMA_PSTRIDE = %d;\n"
509 "__constant__ int IN_SHIFT = %d;\n"
510 "__constant__ int OUT_SHIFT = %d;\n"
511 "__constant__ int MASK = %d;\n"
512 "__constant__ int SWAP_UV = %d;\n"
513 "\n"
514 "__device__ unsigned short\n"
515 "do_scale_pixel (unsigned short val) \n"
516 "{\n"
517 " unsigned int diff;\n"
518 " if (OUT_DEPTH > IN_DEPTH) {\n"
519 " diff = OUT_DEPTH - IN_DEPTH;\n"
520 " return (val << diff) | (val >> (IN_DEPTH - diff));\n"
521 " } else if (IN_DEPTH > OUT_DEPTH) {\n"
522 " return val >> (IN_DEPTH - OUT_DEPTH);\n"
523 " }\n"
524 " return val;\n"
525 "}\n"
526 "\n"
527 "__device__ float\n"
528 "dot(float3 val, float *coeff)\n"
529 "{\n"
530 " return val.x * coeff[0] + val.y * coeff[1] + val.z * coeff[2];\n"
531 "}\n"
532 "\n"
533 "__device__ uint3\n"
534 "rgb_to_yuv (unsigned short r, unsigned short g, unsigned short b,\n"
535 " unsigned int max_val)\n"
536 "{\n"
537 " float3 rgb = make_float3 (r, g, b);\n"
538 " uint3 yuv;\n"
539 " yuv.x = max ((unsigned int)(dot (rgb, ycoeff) + offset[0]), 0);\n"
540 " yuv.y = max ((unsigned int)(dot (rgb, ucoeff) + offset[1]), 0);\n"
541 " yuv.z = max ((unsigned int)(dot (rgb, vcoeff) + offset[2]), 0);\n"
542 " yuv.x = min (yuv.x, max_val);\n"
543 " yuv.y = min (yuv.y, max_val);\n"
544 " yuv.z = min (yuv.z, max_val);\n"
545 " return yuv;\n"
546 "}\n"
547 "\n"
548 /* __global__ void
549 * GST_CUDA_KERNEL_FUNC_TO_ARGB
550 */
551 "%s\n"
552 "\n"
553 /* __device__ ushort2
554 * read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, float x, float y);
555 */
556 "%s\n"
557 "\n"
558 /* __device__ void
559 * write_chroma (unsigned char *dst1, unsigned char *dst2, unsigned short u,
560 * unsigned short v, int x, int y, int pstride, int stride, int mask);
561 */
562 "%s\n"
563 "\n"
564 "__global__ void\n"
565 GST_CUDA_KERNEL_FUNC_TO_Y444
566 "(cudaTextureObject_t srcRGB, unsigned char *dstY, int y_stride,\n"
567 " unsigned char *dstU, int u_stride, unsigned char *dstV, int v_stride,\n"
568 " int width, int height, int dst_pstride, int in_depth)\n"
569 "{\n"
570 " int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
571 " int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
572 " if (x_pos < width && y_pos < height) {\n"
573 " ushort4 argb = tex2D<ushort4>(srcRGB, x_pos, y_pos);\n"
574 " uint3 yuv;\n"
575 " yuv = rgb_to_yuv (argb.y, argb.z, argb.w, (1 << in_depth) - 1);\n"
576 " if (in_depth > 8) {\n"
577 " *(unsigned short *)&dstY[x_pos * dst_pstride + y_pos * y_stride] = yuv.x;\n"
578 " *(unsigned short *)&dstU[x_pos * dst_pstride + y_pos * u_stride] = yuv.y;\n"
579 " *(unsigned short *)&dstV[x_pos * dst_pstride + y_pos * v_stride] = yuv.z;\n"
580 " } else {\n"
581 " dstY[x_pos * dst_pstride + y_pos * y_stride] = yuv.x;\n"
582 " dstU[x_pos * dst_pstride + y_pos * u_stride] = yuv.y;\n"
583 " dstV[x_pos * dst_pstride + y_pos * v_stride] = yuv.z;\n"
584 " }\n"
585 " }\n"
586 "}\n"
587 "\n"
588 "__global__ void\n"
589 GST_CUDA_KERNEL_FUNC_Y444_TO_YUV
590 "(cudaTextureObject_t tex0, cudaTextureObject_t tex1, cudaTextureObject_t tex2,\n"
591 " unsigned char *dst0, unsigned char *dst1, unsigned char *dst2,\n"
592 " int stride)\n"
593 "{\n"
594 " int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
595 " int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
596 " if (x_pos < WIDTH && y_pos < HEIGHT) {\n"
597 " float src_xpos = SCALE_H * x_pos;\n"
598 " float src_ypos = SCALE_V * y_pos;\n"
599 " unsigned short y = tex2D<unsigned short>(tex0, src_xpos, src_ypos);\n"
600 " y = y >> IN_SHIFT;\n"
601 " y = do_scale_pixel (y);\n"
602 " y = y << OUT_SHIFT;\n"
603 " if (OUT_DEPTH > 8) {\n"
604 " *(unsigned short *)&dst0[x_pos * PSTRIDE + y_pos * stride] = (y & MASK);\n"
605 " } else {\n"
606 " dst0[x_pos * PSTRIDE + y_pos * stride] = y;\n"
607 " }\n"
608 " }\n"
609 " if (x_pos < CHROMA_WIDTH && y_pos < CHROMA_HEIGHT) {\n"
610 " float src_xpos = CHROMA_SCALE_H * x_pos;\n"
611 " float src_ypos = CHROMA_SCALE_V * y_pos;\n"
612 " unsigned short u, v;\n"
613 " ushort2 uv;\n"
614 " uv = read_chroma (tex1, tex2, src_xpos, src_ypos);\n"
615 " u = uv.x;\n"
616 " v = uv.y;\n"
617 " u = u >> IN_SHIFT;\n"
618 " v = v >> IN_SHIFT;\n"
619 " u = do_scale_pixel (u);\n"
620 " v = do_scale_pixel (v);\n"
621 " u = u << OUT_SHIFT;\n"
622 " v = v << OUT_SHIFT;\n"
623 " if (SWAP_UV) {\n"
624 " unsigned short tmp = u;\n"
625 " u = v;\n"
626 " v = tmp;\n"
627 " }\n"
628 " write_chroma (dst1,\n"
629 " dst2, u, v, x_pos, y_pos, CHROMA_PSTRIDE, stride, MASK);\n"
630 " }\n"
631 "}\n"
632 "\n"
633 "}";
634
635 /* CUDA kernel source for from RGB to RGB conversion and scale */
636 static const gchar templ_RGB_to_RGB[] =
637 "extern \"C\"{\n"
638 "__constant__ float SCALE_H = %s;\n"
639 "__constant__ float SCALE_V = %s;\n"
640 "__constant__ int WIDTH = %d;\n"
641 "__constant__ int HEIGHT = %d;\n"
642 "__constant__ int IN_DEPTH = %d;\n"
643 "__constant__ int OUT_DEPTH = %d;\n"
644 "__constant__ int PSTRIDE = %d;\n"
645 "__constant__ int R_IDX = %d;\n"
646 "__constant__ int G_IDX = %d;\n"
647 "__constant__ int B_IDX = %d;\n"
648 "__constant__ int A_IDX = %d;\n"
649 "__constant__ int X_IDX = %d;\n"
650 "\n"
651 "__device__ unsigned short\n"
652 "do_scale_pixel (unsigned short val) \n"
653 "{\n"
654 " unsigned int diff;\n"
655 " if (OUT_DEPTH > IN_DEPTH) {\n"
656 " diff = OUT_DEPTH - IN_DEPTH;\n"
657 " return (val << diff) | (val >> (IN_DEPTH - diff));\n"
658 " } else if (IN_DEPTH > OUT_DEPTH) {\n"
659 " return val >> (IN_DEPTH - OUT_DEPTH);\n"
660 " }\n"
661 " return val;\n"
662 "}\n"
663 "\n"
664 /* __global__ void
665 * GST_CUDA_KERNEL_FUNC_TO_ARGB
666 */
667 "%s\n"
668 "\n"
669 /* convert ARGB or ARGB64 to other RGB formats with scale */
670 "__global__ void\n"
671 GST_CUDA_KERNEL_FUNC_SCALE_RGB
672 "(cudaTextureObject_t srcRGB, unsigned char *dstRGB, int dst_stride)\n"
673 "{\n"
674 " int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
675 " int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
676 " if (x_pos < WIDTH && y_pos < HEIGHT) {\n"
677 " float src_xpos = SCALE_H * x_pos;\n"
678 " float src_ypos = SCALE_V * y_pos;\n"
679 " ushort4 argb = tex2D<ushort4>(srcRGB, src_xpos, src_ypos);\n"
680 " argb.x = do_scale_pixel(argb.x);\n"
681 " argb.y = do_scale_pixel(argb.y);\n"
682 " argb.z = do_scale_pixel(argb.z);\n"
683 " argb.w = do_scale_pixel(argb.w);\n"
684 /* FIXME: RGB10A2_LE or BGR10A2_LE only */
685 " if (OUT_DEPTH > 8) {\n"
686 " unsigned int packed_rgb = 0;\n"
687 " unsigned int a, r, g, b;"
688 " a = (argb.x >> 8) & 0x3;\n"
689 " r = argb.y & 0x3ff;\n"
690 " g = argb.z & 0x3ff;\n"
691 " b = argb.w & 0x3ff;\n"
692 /* A is always MSB, we support only little endian system */
693 " packed_rgb = a << 30;\n"
694 " packed_rgb |= (r << (30 - (R_IDX * 10)));\n"
695 " packed_rgb |= (g << (30 - (G_IDX * 10)));\n"
696 " packed_rgb |= (b << (30 - (B_IDX * 10)));\n"
697 " *(unsigned int *)&dstRGB[x_pos * 4 + y_pos * dst_stride] = packed_rgb;\n"
698 " } else {\n"
699 " if (A_IDX >= 0) {\n"
700 " argb.x = do_scale_pixel(argb.x);\n"
701 " dstRGB[x_pos * PSTRIDE + A_IDX + y_pos * dst_stride] = argb.x;\n"
702 " } else if (X_IDX >= 0) {\n"
703 " dstRGB[x_pos * PSTRIDE + X_IDX + y_pos * dst_stride] = 0xff;\n"
704 " }\n"
705 " dstRGB[x_pos * PSTRIDE + R_IDX + y_pos * dst_stride] = argb.y;\n"
706 " dstRGB[x_pos * PSTRIDE + G_IDX + y_pos * dst_stride] = argb.z;\n"
707 " dstRGB[x_pos * PSTRIDE + B_IDX + y_pos * dst_stride] = argb.w;\n"
708 " }\n"
709 " }\n"
710 "}\n"
711 "\n"
712 "}";
713 /* *INDENT-ON* */
714
715 typedef struct
716 {
717 gint R;
718 gint G;
719 gint B;
720 gint A;
721 gint X;
722 } GstCudaRGBOrder;
723
724 typedef struct
725 {
726 CUdeviceptr device_ptr;
727 gsize cuda_stride;
728 } GstCudaStageBuffer;
729
730 #define CONVERTER_MAX_NUM_FUNC 4
731
732 struct _GstCudaConverter
733 {
734 GstVideoInfo in_info;
735 GstVideoInfo out_info;
736 gboolean keep_size;
737
738 gint texture_alignment;
739
740 GstCudaContext *cuda_ctx;
741 CUmodule cuda_module;
742 CUfunction kernel_func[CONVERTER_MAX_NUM_FUNC];
743 const gchar *func_names[CONVERTER_MAX_NUM_FUNC];
744 gchar *kernel_source;
745 gchar *ptx;
746 GstCudaStageBuffer fallback_buffer[GST_VIDEO_MAX_PLANES];
747
748 gboolean (*convert) (GstCudaConverter * convert, const GstCudaMemory * src,
749 GstVideoInfo * in_info, GstCudaMemory * dst, GstVideoInfo * out_info,
750 CUstream cuda_stream);
751
752 const CUdeviceptr src;
753 GstVideoInfo *cur_in_info;
754
755 CUdeviceptr dest;
756 GstVideoInfo *cur_out_info;
757
758 /* rgb to {rgb, yuv} only */
759 GstCudaRGBOrder in_rgb_order;
760 GstCudaStageBuffer unpack_surface;
761 GstCudaStageBuffer y444_surface[GST_VIDEO_MAX_PLANES];
762 };
763
764 #define LOAD_CUDA_FUNC(module,func,name) G_STMT_START { \
765 if (!gst_cuda_result (CuModuleGetFunction (&(func), (module), name))) { \
766 GST_ERROR ("failed to get %s function", (name)); \
767 goto error; \
768 } \
769 } G_STMT_END
770
771 /**
772 * gst_cuda_converter_new:
773 * @in_info: a #GstVideoInfo
774 * @out_info: a #GstVideoInfo
775 * @cuda_ctx: (transfer none): a #GstCudaContext
776 *
777 * Create a new converter object to convert between @in_info and @out_info
778 * with @config.
779 *
780 * Returns: a #GstCudaConverter or %NULL if conversion is not possible.
781 */
782 GstCudaConverter *
gst_cuda_converter_new(GstVideoInfo * in_info,GstVideoInfo * out_info,GstCudaContext * cuda_ctx)783 gst_cuda_converter_new (GstVideoInfo * in_info, GstVideoInfo * out_info,
784 GstCudaContext * cuda_ctx)
785 {
786 GstCudaConverter *convert;
787 gint i;
788
789 g_return_val_if_fail (in_info != NULL, NULL);
790 g_return_val_if_fail (out_info != NULL, NULL);
791 g_return_val_if_fail (cuda_ctx != NULL, NULL);
792 /* we won't ever do framerate conversion */
793 g_return_val_if_fail (in_info->fps_n == out_info->fps_n, NULL);
794 g_return_val_if_fail (in_info->fps_d == out_info->fps_d, NULL);
795 /* we won't ever do deinterlace */
796 g_return_val_if_fail (in_info->interlace_mode == out_info->interlace_mode,
797 NULL);
798
799 convert = g_new0 (GstCudaConverter, 1);
800
801 convert->in_info = *in_info;
802 convert->out_info = *out_info;
803
804 /* FIXME: should return kernel source */
805 if (!gst_cuda_context_push (cuda_ctx)) {
806 GST_ERROR ("cannot push context");
807 goto error;
808 }
809
810 if (!cuda_converter_lookup_path (convert))
811 goto error;
812
813 convert->ptx = gst_cuda_nvrtc_compile (convert->kernel_source);
814 if (!convert->ptx) {
815 GST_ERROR ("no PTX data to load");
816 goto error;
817 }
818
819 GST_TRACE ("compiled convert ptx \n%s", convert->ptx);
820
821 if (!gst_cuda_result (CuModuleLoadData (&convert->cuda_module, convert->ptx))) {
822 gst_cuda_context_pop (NULL);
823 GST_ERROR ("failed to load cuda module data");
824
825 goto error;
826 }
827
828 for (i = 0; i < CONVERTER_MAX_NUM_FUNC; i++) {
829 if (!convert->func_names[i])
830 break;
831
832 LOAD_CUDA_FUNC (convert->cuda_module, convert->kernel_func[i],
833 convert->func_names[i]);
834 GST_DEBUG ("kernel function \"%s\" loaded", convert->func_names[i]);
835 }
836
837 gst_cuda_context_pop (NULL);
838 convert->cuda_ctx = gst_object_ref (cuda_ctx);
839 convert->texture_alignment =
840 gst_cuda_context_get_texture_alignment (cuda_ctx);
841
842 g_free (convert->kernel_source);
843 g_free (convert->ptx);
844 convert->kernel_source = NULL;
845 convert->ptx = NULL;
846
847 return convert;
848
849 error:
850 gst_cuda_context_pop (NULL);
851 gst_cuda_converter_free (convert);
852
853 return NULL;
854 }
855
856 /**
857 * gst_video_converter_free:
858 * @convert: a #GstCudaConverter
859 *
860 * Free @convert
861 */
862 void
gst_cuda_converter_free(GstCudaConverter * convert)863 gst_cuda_converter_free (GstCudaConverter * convert)
864 {
865 g_return_if_fail (convert != NULL);
866
867 if (convert->cuda_ctx) {
868 if (gst_cuda_context_push (convert->cuda_ctx)) {
869 gint i;
870
871 if (convert->cuda_module) {
872 gst_cuda_result (CuModuleUnload (convert->cuda_module));
873 }
874
875 for (i = 0; i < GST_VIDEO_MAX_PLANES; i++) {
876 if (convert->fallback_buffer[i].device_ptr)
877 gst_cuda_result (CuMemFree (convert->fallback_buffer[i].device_ptr));
878 if (convert->y444_surface[i].device_ptr)
879 gst_cuda_result (CuMemFree (convert->y444_surface[i].device_ptr));
880 }
881
882 if (convert->unpack_surface.device_ptr)
883 gst_cuda_result (CuMemFree (convert->unpack_surface.device_ptr));
884
885 gst_cuda_context_pop (NULL);
886 }
887
888 gst_object_unref (convert->cuda_ctx);
889 }
890
891 g_free (convert->kernel_source);
892 g_free (convert->ptx);
893 g_free (convert);
894 }
895
896 /**
897 * gst_cuda_converter_frame:
898 * @convert: a #GstCudaConverter
899 * @src: a #GstCudaMemory
900 * @in_info: a #GstVideoInfo representing @src
901 * @dst: a #GstCudaMemory
902 * @out_info: a #GstVideoInfo representing @dst
903 * @cuda_stream: a #CUstream
904 *
905 * Convert the pixels of @src into @dest using @convert.
906 * Called without gst_cuda_context_push() and gst_cuda_context_pop() by caller
907 */
908 gboolean
gst_cuda_converter_frame(GstCudaConverter * convert,const GstCudaMemory * src,GstVideoInfo * in_info,GstCudaMemory * dst,GstVideoInfo * out_info,CUstream cuda_stream)909 gst_cuda_converter_frame (GstCudaConverter * convert, const GstCudaMemory * src,
910 GstVideoInfo * in_info, GstCudaMemory * dst, GstVideoInfo * out_info,
911 CUstream cuda_stream)
912 {
913 gboolean ret;
914
915 g_return_val_if_fail (convert, FALSE);
916 g_return_val_if_fail (src, FALSE);
917 g_return_val_if_fail (in_info, FALSE);
918 g_return_val_if_fail (dst, FALSE);
919 g_return_val_if_fail (out_info, FALSE);
920
921 gst_cuda_context_push (convert->cuda_ctx);
922
923 ret = gst_cuda_converter_frame_unlocked (convert,
924 src, in_info, dst, out_info, cuda_stream);
925
926 gst_cuda_context_pop (NULL);
927
928 return ret;
929 }
930
931 /**
932 * gst_cuda_converter_frame_unlocked:
933 * @convert: a #GstCudaConverter
934 * @src: a #GstCudaMemory
935 * @in_info: a #GstVideoInfo representing @src
936 * @dst: a #GstCudaMemory
937 * @out_info: a #GstVideoInfo representing @dest
938 * @cuda_stream: a #CUstream
939 *
940 * Convert the pixels of @src into @dest using @convert.
941 * Caller should call this method after gst_cuda_context_push()
942 */
943 gboolean
gst_cuda_converter_frame_unlocked(GstCudaConverter * convert,const GstCudaMemory * src,GstVideoInfo * in_info,GstCudaMemory * dst,GstVideoInfo * out_info,CUstream cuda_stream)944 gst_cuda_converter_frame_unlocked (GstCudaConverter * convert,
945 const GstCudaMemory * src, GstVideoInfo * in_info, GstCudaMemory * dst,
946 GstVideoInfo * out_info, CUstream cuda_stream)
947 {
948 g_return_val_if_fail (convert, FALSE);
949 g_return_val_if_fail (src, FALSE);
950 g_return_val_if_fail (in_info, FALSE);
951 g_return_val_if_fail (dst, FALSE);
952 g_return_val_if_fail (out_info, FALSE);
953
954 return convert->convert (convert, src, in_info, dst, out_info, cuda_stream);
955 }
956
957 /* allocate fallback memory for texture alignment requirement */
958 static gboolean
convert_ensure_fallback_memory(GstCudaConverter * convert,GstVideoInfo * info,guint plane)959 convert_ensure_fallback_memory (GstCudaConverter * convert,
960 GstVideoInfo * info, guint plane)
961 {
962 CUresult ret;
963 guint element_size = 8;
964
965 if (convert->fallback_buffer[plane].device_ptr)
966 return TRUE;
967
968 if (GST_VIDEO_INFO_COMP_DEPTH (info, 0) > 8)
969 element_size = 16;
970
971 ret = CuMemAllocPitch (&convert->fallback_buffer[plane].device_ptr,
972 &convert->fallback_buffer[plane].cuda_stride,
973 GST_VIDEO_INFO_COMP_WIDTH (info, plane) *
974 GST_VIDEO_INFO_COMP_PSTRIDE (info, plane),
975 GST_VIDEO_INFO_COMP_HEIGHT (info, plane), element_size);
976
977 if (!gst_cuda_result (ret)) {
978 GST_ERROR ("failed to allocated fallback memory");
979 return FALSE;
980 }
981
982 return TRUE;
983 }
984
985 /* create a 2D CUDA texture without alignment check */
986 static CUtexObject
convert_create_texture_unchecked(const CUdeviceptr src,gint width,gint height,gint channels,gint stride,CUarray_format format,CUfilter_mode mode,CUstream cuda_stream)987 convert_create_texture_unchecked (const CUdeviceptr src, gint width,
988 gint height, gint channels, gint stride, CUarray_format format,
989 CUfilter_mode mode, CUstream cuda_stream)
990 {
991 CUDA_TEXTURE_DESC texture_desc;
992 CUDA_RESOURCE_DESC resource_desc;
993 CUtexObject texture = 0;
994 CUresult cuda_ret;
995
996 memset (&texture_desc, 0, sizeof (CUDA_TEXTURE_DESC));
997 memset (&resource_desc, 0, sizeof (CUDA_RESOURCE_DESC));
998
999 resource_desc.resType = CU_RESOURCE_TYPE_PITCH2D;
1000 resource_desc.res.pitch2D.format = format;
1001 resource_desc.res.pitch2D.numChannels = channels;
1002 resource_desc.res.pitch2D.width = width;
1003 resource_desc.res.pitch2D.height = height;
1004 resource_desc.res.pitch2D.pitchInBytes = stride;
1005 resource_desc.res.pitch2D.devPtr = src;
1006
1007 texture_desc.filterMode = mode;
1008 texture_desc.flags = CU_TRSF_READ_AS_INTEGER;
1009
1010 gst_cuda_result (CuStreamSynchronize (cuda_stream));
1011 cuda_ret = CuTexObjectCreate (&texture, &resource_desc, &texture_desc, NULL);
1012
1013 if (!gst_cuda_result (cuda_ret)) {
1014 GST_ERROR ("couldn't create texture");
1015
1016 return 0;
1017 }
1018
1019 return texture;
1020 }
1021
1022 static CUtexObject
convert_create_texture(GstCudaConverter * convert,const GstCudaMemory * src,GstVideoInfo * info,guint plane,CUstream cuda_stream)1023 convert_create_texture (GstCudaConverter * convert, const GstCudaMemory * src,
1024 GstVideoInfo * info, guint plane, CUstream cuda_stream)
1025 {
1026 CUarray_format format = CU_AD_FORMAT_UNSIGNED_INT8;
1027 guint channels = 1;
1028 CUdeviceptr src_ptr;
1029 gsize stride;
1030 CUresult cuda_ret;
1031 CUfilter_mode mode;
1032
1033 if (GST_VIDEO_INFO_COMP_DEPTH (info, plane) > 8)
1034 format = CU_AD_FORMAT_UNSIGNED_INT16;
1035
1036 /* FIXME: more graceful method ? */
1037 if (plane != 0 &&
1038 GST_VIDEO_INFO_N_PLANES (info) != GST_VIDEO_INFO_N_COMPONENTS (info)) {
1039 channels = 2;
1040 }
1041
1042 src_ptr = src->data + src->offset[plane];
1043 stride = src->stride;
1044
1045 if (convert->texture_alignment && (src_ptr % convert->texture_alignment)) {
1046 CUDA_MEMCPY2D copy_params = { 0, };
1047
1048 if (!convert_ensure_fallback_memory (convert, info, plane))
1049 return 0;
1050
1051 GST_LOG ("device memory was not aligned, copy to fallback memory");
1052
1053 copy_params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
1054 copy_params.srcPitch = stride;
1055 copy_params.srcDevice = (CUdeviceptr) src_ptr;
1056
1057 copy_params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
1058 copy_params.dstPitch = convert->fallback_buffer[plane].cuda_stride;
1059 copy_params.dstDevice = convert->fallback_buffer[plane].device_ptr;
1060 copy_params.WidthInBytes = GST_VIDEO_INFO_COMP_WIDTH (info, plane)
1061 * GST_VIDEO_INFO_COMP_PSTRIDE (info, plane);
1062 copy_params.Height = GST_VIDEO_INFO_COMP_HEIGHT (info, plane);
1063
1064 cuda_ret = CuMemcpy2DAsync (©_params, cuda_stream);
1065 if (!gst_cuda_result (cuda_ret)) {
1066 GST_ERROR ("failed to copy to fallback buffer");
1067 return 0;
1068 }
1069
1070 src_ptr = convert->fallback_buffer[plane].device_ptr;
1071 stride = convert->fallback_buffer[plane].cuda_stride;
1072 }
1073
1074 /* Use h/w linear interpolation only when resize is required.
1075 * Otherwise the image might be blurred */
1076 if (convert->keep_size)
1077 mode = CU_TR_FILTER_MODE_POINT;
1078 else
1079 mode = CU_TR_FILTER_MODE_LINEAR;
1080
1081 return convert_create_texture_unchecked (src_ptr,
1082 GST_VIDEO_INFO_COMP_WIDTH (info, plane),
1083 GST_VIDEO_INFO_COMP_HEIGHT (info, plane), channels, stride, format, mode,
1084 cuda_stream);
1085 }
1086
1087 /* main conversion function for YUV to YUV conversion */
1088 static gboolean
convert_YUV_TO_YUV(GstCudaConverter * convert,const GstCudaMemory * src,GstVideoInfo * in_info,GstCudaMemory * dst,GstVideoInfo * out_info,CUstream cuda_stream)1089 convert_YUV_TO_YUV (GstCudaConverter * convert,
1090 const GstCudaMemory * src, GstVideoInfo * in_info, GstCudaMemory * dst,
1091 GstVideoInfo * out_info, CUstream cuda_stream)
1092 {
1093 CUtexObject texture[GST_VIDEO_MAX_PLANES] = { 0, };
1094 CUresult cuda_ret;
1095 gboolean ret = FALSE;
1096 CUdeviceptr dst_ptr[GST_VIDEO_MAX_PLANES] = { 0, };
1097 gint dst_stride;
1098 gint width, height;
1099 gint i;
1100
1101 gpointer kernel_args[] = { &texture[0], &texture[1], &texture[2],
1102 &dst_ptr[0], &dst_ptr[1], &dst_ptr[2], &dst_stride
1103 };
1104
1105 /* conversion step
1106 * STEP 1: create CUtexObject per plane
1107 * STEP 2: call YUV to YUV conversion kernel function.
1108 * resize, uv reordering and bitdepth conversion will be performed in
1109 * the CUDA kernel function
1110 */
1111
1112 /* map CUDA device memory to CUDA texture object */
1113 for (i = 0; i < GST_VIDEO_INFO_N_PLANES (in_info); i++) {
1114 texture[i] = convert_create_texture (convert, src, in_info, i, cuda_stream);
1115 if (!texture[i]) {
1116 GST_ERROR ("couldn't create texture for %d th plane", i);
1117 goto done;
1118 }
1119 }
1120
1121 for (i = 0; i < GST_VIDEO_INFO_N_PLANES (out_info); i++)
1122 dst_ptr[i] = dst->data + dst->offset[i];
1123
1124 dst_stride = dst->stride;
1125
1126 width = GST_VIDEO_INFO_WIDTH (out_info);
1127 height = GST_VIDEO_INFO_HEIGHT (out_info);
1128
1129 cuda_ret =
1130 CuLaunchKernel (convert->kernel_func[0], DIV_UP (width, CUDA_BLOCK_X),
1131 DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
1132 cuda_stream, kernel_args, NULL);
1133
1134 if (!gst_cuda_result (cuda_ret)) {
1135 GST_ERROR ("could not rescale plane");
1136 goto done;
1137 }
1138
1139 ret = TRUE;
1140 gst_cuda_result (CuStreamSynchronize (cuda_stream));
1141
1142 done:
1143 for (i = 0; i < GST_VIDEO_INFO_N_PLANES (in_info); i++) {
1144 if (texture[i])
1145 gst_cuda_result (CuTexObjectDestroy (texture[i]));
1146 }
1147
1148 return ret;
1149 }
1150
1151 /* main conversion function for YUV to RGB conversion */
1152 static gboolean
convert_YUV_TO_RGB(GstCudaConverter * convert,const GstCudaMemory * src,GstVideoInfo * in_info,GstCudaMemory * dst,GstVideoInfo * out_info,CUstream cuda_stream)1153 convert_YUV_TO_RGB (GstCudaConverter * convert,
1154 const GstCudaMemory * src, GstVideoInfo * in_info, GstCudaMemory * dst,
1155 GstVideoInfo * out_info, CUstream cuda_stream)
1156 {
1157 CUtexObject texture[GST_VIDEO_MAX_PLANES] = { 0, };
1158 CUresult cuda_ret;
1159 gboolean ret = FALSE;
1160 CUdeviceptr dstRGB = 0;
1161 gint dst_stride;
1162 gint width, height;
1163 gint i;
1164
1165 gpointer kernel_args[] = { &texture[0], &texture[1], &texture[2],
1166 &dstRGB, &dst_stride
1167 };
1168
1169 /* conversion step
1170 * STEP 1: create CUtexObject per plane
1171 * STEP 2: call YUV to RGB conversion kernel function.
1172 * resizing, argb ordering and bitdepth conversion will be performed in
1173 * the CUDA kernel function
1174 */
1175
1176 /* map CUDA device memory to CUDA texture object */
1177 for (i = 0; i < GST_VIDEO_INFO_N_PLANES (in_info); i++) {
1178 texture[i] = convert_create_texture (convert, src, in_info, i, cuda_stream);
1179 if (!texture[i]) {
1180 GST_ERROR ("couldn't create texture for %d th plane", i);
1181 goto done;
1182 }
1183 }
1184
1185 dstRGB = dst->data;
1186 dst_stride = dst->stride;
1187
1188 width = GST_VIDEO_INFO_WIDTH (out_info);
1189 height = GST_VIDEO_INFO_HEIGHT (out_info);
1190
1191 cuda_ret =
1192 CuLaunchKernel (convert->kernel_func[0], DIV_UP (width, CUDA_BLOCK_X),
1193 DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
1194 cuda_stream, kernel_args, NULL);
1195
1196 if (!gst_cuda_result (cuda_ret)) {
1197 GST_ERROR ("could not rescale plane");
1198 goto done;
1199 }
1200
1201 ret = TRUE;
1202 gst_cuda_result (CuStreamSynchronize (cuda_stream));
1203
1204 done:
1205 for (i = 0; i < GST_VIDEO_INFO_N_PLANES (in_info); i++) {
1206 if (texture[i])
1207 gst_cuda_result (CuTexObjectDestroy (texture[i]));
1208 }
1209
1210 return ret;
1211 }
1212
1213 static gboolean
convert_UNPACK_RGB(GstCudaConverter * convert,CUfunction kernel_func,CUstream cuda_stream,const GstCudaMemory * src,GstVideoInfo * in_info,CUdeviceptr dst,gint dst_stride,GstCudaRGBOrder * rgb_order)1214 convert_UNPACK_RGB (GstCudaConverter * convert, CUfunction kernel_func,
1215 CUstream cuda_stream, const GstCudaMemory * src, GstVideoInfo * in_info,
1216 CUdeviceptr dst, gint dst_stride, GstCudaRGBOrder * rgb_order)
1217 {
1218 CUdeviceptr srcRGB = 0;
1219 gint width, height;
1220 gint src_stride, src_pstride;
1221 CUresult cuda_ret;
1222
1223 gpointer unpack_kernel_args[] = { &srcRGB, &dst,
1224 &width, &height,
1225 &src_stride, &src_pstride, &dst_stride,
1226 &convert->in_rgb_order.R, &convert->in_rgb_order.G,
1227 &convert->in_rgb_order.B, &convert->in_rgb_order.A,
1228 };
1229
1230 srcRGB = src->data;
1231 src_stride = src->stride;
1232
1233 width = GST_VIDEO_INFO_WIDTH (in_info);
1234 height = GST_VIDEO_INFO_HEIGHT (in_info);
1235 src_pstride = GST_VIDEO_INFO_COMP_PSTRIDE (in_info, 0);
1236
1237 cuda_ret =
1238 CuLaunchKernel (kernel_func, DIV_UP (width, CUDA_BLOCK_X),
1239 DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
1240 cuda_stream, unpack_kernel_args, NULL);
1241
1242 if (!gst_cuda_result (cuda_ret)) {
1243 GST_ERROR ("could not unpack rgb");
1244 return FALSE;
1245 }
1246
1247 return TRUE;
1248 }
1249
1250 static gboolean
convert_TO_Y444(GstCudaConverter * convert,CUfunction kernel_func,CUstream cuda_stream,CUtexObject srcRGB,CUdeviceptr dstY,gint y_stride,CUdeviceptr dstU,gint u_stride,CUdeviceptr dstV,gint v_stride,gint width,gint height,gint pstride,gint bitdepth)1251 convert_TO_Y444 (GstCudaConverter * convert, CUfunction kernel_func,
1252 CUstream cuda_stream, CUtexObject srcRGB, CUdeviceptr dstY, gint y_stride,
1253 CUdeviceptr dstU, gint u_stride, CUdeviceptr dstV, gint v_stride,
1254 gint width, gint height, gint pstride, gint bitdepth)
1255 {
1256 CUresult cuda_ret;
1257
1258 gpointer kernel_args[] = { &srcRGB, &dstY, &y_stride, &dstU, &u_stride, &dstV,
1259 &v_stride, &width, &height, &pstride, &bitdepth,
1260 };
1261
1262 cuda_ret =
1263 CuLaunchKernel (kernel_func, DIV_UP (width, CUDA_BLOCK_X),
1264 DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
1265 cuda_stream, kernel_args, NULL);
1266
1267 if (!gst_cuda_result (cuda_ret)) {
1268 GST_ERROR ("could not unpack rgb");
1269 return FALSE;
1270 }
1271
1272 return TRUE;
1273 }
1274
1275 /* main conversion function for RGB to YUV conversion */
1276 static gboolean
convert_RGB_TO_YUV(GstCudaConverter * convert,const GstCudaMemory * src,GstVideoInfo * in_info,GstCudaMemory * dst,GstVideoInfo * out_info,CUstream cuda_stream)1277 convert_RGB_TO_YUV (GstCudaConverter * convert,
1278 const GstCudaMemory * src, GstVideoInfo * in_info, GstCudaMemory * dst,
1279 GstVideoInfo * out_info, CUstream cuda_stream)
1280 {
1281 CUtexObject texture = 0;
1282 CUtexObject yuv_texture[3] = { 0, };
1283 CUdeviceptr dst_ptr[GST_VIDEO_MAX_PLANES] = { 0, };
1284 CUresult cuda_ret;
1285 gboolean ret = FALSE;
1286 gint in_width, in_height;
1287 gint out_width, out_height;
1288 gint dst_stride;
1289 CUarray_format format = CU_AD_FORMAT_UNSIGNED_INT8;
1290 CUfilter_mode mode = CU_TR_FILTER_MODE_POINT;
1291 gint pstride = 1;
1292 gint bitdepth = 8;
1293 gint i;
1294
1295 gpointer kernel_args[] = { &yuv_texture[0], &yuv_texture[1], &yuv_texture[2],
1296 &dst_ptr[0], &dst_ptr[1], &dst_ptr[2], &dst_stride
1297 };
1298
1299 /* conversion step
1300 * STEP 1: unpack src RGB into ARGB or ARGB64 format
1301 * STEP 2: convert unpacked ARGB (or ARGB64) to Y444 (or Y444_16LE)
1302 * STEP 3: convert Y444 (or Y444_16LE) to final YUV format.
1303 * resizing, bitdepth conversion, uv reordering will be performed in
1304 * the CUDA kernel function
1305 */
1306 if (!convert_UNPACK_RGB (convert, convert->kernel_func[0], cuda_stream,
1307 src, in_info, convert->unpack_surface.device_ptr,
1308 convert->unpack_surface.cuda_stride, &convert->in_rgb_order)) {
1309 GST_ERROR ("could not unpack input rgb");
1310
1311 goto done;
1312 }
1313
1314 in_width = GST_VIDEO_INFO_WIDTH (in_info);
1315 in_height = GST_VIDEO_INFO_HEIGHT (in_info);
1316
1317 out_width = GST_VIDEO_INFO_WIDTH (out_info);
1318 out_height = GST_VIDEO_INFO_HEIGHT (out_info);
1319 dst_stride = dst->stride;
1320
1321 if (GST_VIDEO_INFO_COMP_DEPTH (in_info, 0) > 8) {
1322 pstride = 2;
1323 bitdepth = 16;
1324 format = CU_AD_FORMAT_UNSIGNED_INT16;
1325 }
1326
1327 texture =
1328 convert_create_texture_unchecked (convert->unpack_surface.device_ptr,
1329 in_width, in_height, 4, convert->unpack_surface.cuda_stride, format,
1330 mode, cuda_stream);
1331
1332 if (!texture) {
1333 GST_ERROR ("could not create texture");
1334 goto done;
1335 }
1336
1337 if (!convert_TO_Y444 (convert, convert->kernel_func[1], cuda_stream, texture,
1338 convert->y444_surface[0].device_ptr,
1339 convert->y444_surface[0].cuda_stride,
1340 convert->y444_surface[1].device_ptr,
1341 convert->y444_surface[1].cuda_stride,
1342 convert->y444_surface[2].device_ptr,
1343 convert->y444_surface[2].cuda_stride, in_width, in_height, pstride,
1344 bitdepth)) {
1345 GST_ERROR ("could not convert to Y444 or Y444_16LE");
1346 goto done;
1347 }
1348
1349 /* Use h/w linear interpolation only when resize is required.
1350 * Otherwise the image might be blurred */
1351 if (convert->keep_size)
1352 mode = CU_TR_FILTER_MODE_POINT;
1353 else
1354 mode = CU_TR_FILTER_MODE_LINEAR;
1355
1356 for (i = 0; i < 3; i++) {
1357 yuv_texture[i] =
1358 convert_create_texture_unchecked (convert->y444_surface[i].device_ptr,
1359 in_width, in_height, 1, convert->y444_surface[i].cuda_stride, format,
1360 mode, cuda_stream);
1361
1362 if (!yuv_texture[i]) {
1363 GST_ERROR ("could not create %dth yuv texture", i);
1364 goto done;
1365 }
1366 }
1367
1368 for (i = 0; i < GST_VIDEO_INFO_N_PLANES (out_info); i++)
1369 dst_ptr[i] = dst->data + dst->offset[i];
1370
1371 cuda_ret =
1372 CuLaunchKernel (convert->kernel_func[2], DIV_UP (out_width, CUDA_BLOCK_X),
1373 DIV_UP (out_height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
1374 cuda_stream, kernel_args, NULL);
1375
1376 if (!gst_cuda_result (cuda_ret)) {
1377 GST_ERROR ("could not rescale plane");
1378 goto done;
1379 }
1380
1381 ret = TRUE;
1382 gst_cuda_result (CuStreamSynchronize (cuda_stream));
1383
1384 done:
1385 if (texture)
1386 gst_cuda_result (CuTexObjectDestroy (texture));
1387 for (i = 0; i < 3; i++) {
1388 if (yuv_texture[i])
1389 gst_cuda_result (CuTexObjectDestroy (yuv_texture[i]));
1390 }
1391
1392 return ret;
1393 }
1394
1395 /* main conversion function for RGB to RGB conversion */
1396 static gboolean
convert_RGB_TO_RGB(GstCudaConverter * convert,const GstCudaMemory * src,GstVideoInfo * in_info,GstCudaMemory * dst,GstVideoInfo * out_info,CUstream cuda_stream)1397 convert_RGB_TO_RGB (GstCudaConverter * convert,
1398 const GstCudaMemory * src, GstVideoInfo * in_info, GstCudaMemory * dst,
1399 GstVideoInfo * out_info, CUstream cuda_stream)
1400 {
1401 CUtexObject texture = 0;
1402 CUresult cuda_ret;
1403 gboolean ret = FALSE;
1404 CUdeviceptr dstRGB = 0;
1405 gint in_width, in_height;
1406 gint out_width, out_height;
1407 gint dst_stride;
1408 CUfilter_mode mode;
1409 CUarray_format format = CU_AD_FORMAT_UNSIGNED_INT8;
1410
1411 gpointer rescale_kernel_args[] = { &texture, &dstRGB, &dst_stride };
1412
1413 /* conversion step
1414 * STEP 1: unpack src RGB into ARGB or ARGB64 format
1415 * STEP 2: convert ARGB (or ARGB64) to final RGB format.
1416 * resizing, bitdepth conversion, argb reordering will be performed in
1417 * the CUDA kernel function
1418 */
1419
1420 if (!convert_UNPACK_RGB (convert, convert->kernel_func[0], cuda_stream,
1421 src, in_info, convert->unpack_surface.device_ptr,
1422 convert->unpack_surface.cuda_stride, &convert->in_rgb_order)) {
1423 GST_ERROR ("could not unpack input rgb");
1424
1425 goto done;
1426 }
1427
1428 in_width = GST_VIDEO_INFO_WIDTH (in_info);
1429 in_height = GST_VIDEO_INFO_HEIGHT (in_info);
1430
1431 out_width = GST_VIDEO_INFO_WIDTH (out_info);
1432 out_height = GST_VIDEO_INFO_HEIGHT (out_info);
1433
1434 dstRGB = dst->data;
1435 dst_stride = dst->stride;
1436
1437 if (GST_VIDEO_INFO_COMP_DEPTH (in_info, 0) > 8)
1438 format = CU_AD_FORMAT_UNSIGNED_INT16;
1439
1440 /* Use h/w linear interpolation only when resize is required.
1441 * Otherwise the image might be blurred */
1442 if (convert->keep_size)
1443 mode = CU_TR_FILTER_MODE_POINT;
1444 else
1445 mode = CU_TR_FILTER_MODE_LINEAR;
1446
1447 texture =
1448 convert_create_texture_unchecked (convert->unpack_surface.device_ptr,
1449 in_width, in_height, 4, convert->unpack_surface.cuda_stride, format,
1450 mode, cuda_stream);
1451
1452 if (!texture) {
1453 GST_ERROR ("could not create texture");
1454 goto done;
1455 }
1456
1457 cuda_ret =
1458 CuLaunchKernel (convert->kernel_func[1], DIV_UP (out_width, CUDA_BLOCK_X),
1459 DIV_UP (out_height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
1460 cuda_stream, rescale_kernel_args, NULL);
1461
1462 if (!gst_cuda_result (cuda_ret)) {
1463 GST_ERROR ("could not rescale plane");
1464 goto done;
1465 }
1466
1467 ret = TRUE;
1468 gst_cuda_result (CuStreamSynchronize (cuda_stream));
1469
1470 done:
1471 if (texture)
1472 gst_cuda_result (CuTexObjectDestroy (texture));
1473
1474 return ret;
1475 }
1476
1477 /* from video-converter.c */
1478 typedef struct
1479 {
1480 gdouble dm[4][4];
1481 } MatrixData;
1482
1483 static void
color_matrix_set_identity(MatrixData * m)1484 color_matrix_set_identity (MatrixData * m)
1485 {
1486 gint i, j;
1487
1488 for (i = 0; i < 4; i++) {
1489 for (j = 0; j < 4; j++) {
1490 m->dm[i][j] = (i == j);
1491 }
1492 }
1493 }
1494
1495 static void
color_matrix_copy(MatrixData * d,const MatrixData * s)1496 color_matrix_copy (MatrixData * d, const MatrixData * s)
1497 {
1498 gint i, j;
1499
1500 for (i = 0; i < 4; i++)
1501 for (j = 0; j < 4; j++)
1502 d->dm[i][j] = s->dm[i][j];
1503 }
1504
1505 /* Perform 4x4 matrix multiplication:
1506 * - @dst@ = @a@ * @b@
1507 * - @dst@ may be a pointer to @a@ andor @b@
1508 */
1509 static void
color_matrix_multiply(MatrixData * dst,MatrixData * a,MatrixData * b)1510 color_matrix_multiply (MatrixData * dst, MatrixData * a, MatrixData * b)
1511 {
1512 MatrixData tmp;
1513 gint i, j, k;
1514
1515 for (i = 0; i < 4; i++) {
1516 for (j = 0; j < 4; j++) {
1517 gdouble x = 0;
1518 for (k = 0; k < 4; k++) {
1519 x += a->dm[i][k] * b->dm[k][j];
1520 }
1521 tmp.dm[i][j] = x;
1522 }
1523 }
1524 color_matrix_copy (dst, &tmp);
1525 }
1526
1527 static void
color_matrix_offset_components(MatrixData * m,gdouble a1,gdouble a2,gdouble a3)1528 color_matrix_offset_components (MatrixData * m, gdouble a1, gdouble a2,
1529 gdouble a3)
1530 {
1531 MatrixData a;
1532
1533 color_matrix_set_identity (&a);
1534 a.dm[0][3] = a1;
1535 a.dm[1][3] = a2;
1536 a.dm[2][3] = a3;
1537 color_matrix_multiply (m, &a, m);
1538 }
1539
1540 static void
color_matrix_scale_components(MatrixData * m,gdouble a1,gdouble a2,gdouble a3)1541 color_matrix_scale_components (MatrixData * m, gdouble a1, gdouble a2,
1542 gdouble a3)
1543 {
1544 MatrixData a;
1545
1546 color_matrix_set_identity (&a);
1547 a.dm[0][0] = a1;
1548 a.dm[1][1] = a2;
1549 a.dm[2][2] = a3;
1550 color_matrix_multiply (m, &a, m);
1551 }
1552
1553 static void
color_matrix_debug(const MatrixData * s)1554 color_matrix_debug (const MatrixData * s)
1555 {
1556 GST_DEBUG ("[%f %f %f %f]", s->dm[0][0], s->dm[0][1], s->dm[0][2],
1557 s->dm[0][3]);
1558 GST_DEBUG ("[%f %f %f %f]", s->dm[1][0], s->dm[1][1], s->dm[1][2],
1559 s->dm[1][3]);
1560 GST_DEBUG ("[%f %f %f %f]", s->dm[2][0], s->dm[2][1], s->dm[2][2],
1561 s->dm[2][3]);
1562 GST_DEBUG ("[%f %f %f %f]", s->dm[3][0], s->dm[3][1], s->dm[3][2],
1563 s->dm[3][3]);
1564 }
1565
1566 static void
color_matrix_YCbCr_to_RGB(MatrixData * m,gdouble Kr,gdouble Kb)1567 color_matrix_YCbCr_to_RGB (MatrixData * m, gdouble Kr, gdouble Kb)
1568 {
1569 gdouble Kg = 1.0 - Kr - Kb;
1570 MatrixData k = {
1571 {
1572 {1., 0., 2 * (1 - Kr), 0.},
1573 {1., -2 * Kb * (1 - Kb) / Kg, -2 * Kr * (1 - Kr) / Kg, 0.},
1574 {1., 2 * (1 - Kb), 0., 0.},
1575 {0., 0., 0., 1.},
1576 }
1577 };
1578
1579 color_matrix_multiply (m, &k, m);
1580 }
1581
1582 static void
color_matrix_RGB_to_YCbCr(MatrixData * m,gdouble Kr,gdouble Kb)1583 color_matrix_RGB_to_YCbCr (MatrixData * m, gdouble Kr, gdouble Kb)
1584 {
1585 gdouble Kg = 1.0 - Kr - Kb;
1586 MatrixData k;
1587 gdouble x;
1588
1589 k.dm[0][0] = Kr;
1590 k.dm[0][1] = Kg;
1591 k.dm[0][2] = Kb;
1592 k.dm[0][3] = 0;
1593
1594 x = 1 / (2 * (1 - Kb));
1595 k.dm[1][0] = -x * Kr;
1596 k.dm[1][1] = -x * Kg;
1597 k.dm[1][2] = x * (1 - Kb);
1598 k.dm[1][3] = 0;
1599
1600 x = 1 / (2 * (1 - Kr));
1601 k.dm[2][0] = x * (1 - Kr);
1602 k.dm[2][1] = -x * Kg;
1603 k.dm[2][2] = -x * Kb;
1604 k.dm[2][3] = 0;
1605
1606 k.dm[3][0] = 0;
1607 k.dm[3][1] = 0;
1608 k.dm[3][2] = 0;
1609 k.dm[3][3] = 1;
1610
1611 color_matrix_multiply (m, &k, m);
1612 }
1613
1614 static void
compute_matrix_to_RGB(GstCudaConverter * convert,MatrixData * data,GstVideoInfo * info)1615 compute_matrix_to_RGB (GstCudaConverter * convert, MatrixData * data,
1616 GstVideoInfo * info)
1617 {
1618 gdouble Kr = 0, Kb = 0;
1619 gint offset[4], scale[4];
1620
1621 /* bring color components to [0..1.0] range */
1622 gst_video_color_range_offsets (info->colorimetry.range, info->finfo, offset,
1623 scale);
1624
1625 color_matrix_offset_components (data, -offset[0], -offset[1], -offset[2]);
1626 color_matrix_scale_components (data, 1 / ((float) scale[0]),
1627 1 / ((float) scale[1]), 1 / ((float) scale[2]));
1628
1629 if (!GST_VIDEO_INFO_IS_RGB (info)) {
1630 /* bring components to R'G'B' space */
1631 if (gst_video_color_matrix_get_Kr_Kb (info->colorimetry.matrix, &Kr, &Kb))
1632 color_matrix_YCbCr_to_RGB (data, Kr, Kb);
1633 }
1634 color_matrix_debug (data);
1635 }
1636
1637 static void
compute_matrix_to_YUV(GstCudaConverter * convert,MatrixData * data,GstVideoInfo * info)1638 compute_matrix_to_YUV (GstCudaConverter * convert, MatrixData * data,
1639 GstVideoInfo * info)
1640 {
1641 gdouble Kr = 0, Kb = 0;
1642 gint offset[4], scale[4];
1643
1644 if (!GST_VIDEO_INFO_IS_RGB (info)) {
1645 /* bring components to YCbCr space */
1646 if (gst_video_color_matrix_get_Kr_Kb (info->colorimetry.matrix, &Kr, &Kb))
1647 color_matrix_RGB_to_YCbCr (data, Kr, Kb);
1648 }
1649
1650 /* bring color components to nominal range */
1651 gst_video_color_range_offsets (info->colorimetry.range, info->finfo, offset,
1652 scale);
1653
1654 color_matrix_scale_components (data, (float) scale[0], (float) scale[1],
1655 (float) scale[2]);
1656 color_matrix_offset_components (data, offset[0], offset[1], offset[2]);
1657
1658 color_matrix_debug (data);
1659 }
1660
1661 static gboolean
cuda_converter_get_matrix(GstCudaConverter * convert,MatrixData * matrix,GstVideoInfo * in_info,GstVideoInfo * out_info)1662 cuda_converter_get_matrix (GstCudaConverter * convert, MatrixData * matrix,
1663 GstVideoInfo * in_info, GstVideoInfo * out_info)
1664 {
1665 gboolean same_matrix, same_bits;
1666 guint in_bits, out_bits;
1667
1668 in_bits = GST_VIDEO_INFO_COMP_DEPTH (in_info, 0);
1669 out_bits = GST_VIDEO_INFO_COMP_DEPTH (out_info, 0);
1670
1671 same_bits = in_bits == out_bits;
1672 same_matrix = in_info->colorimetry.matrix == out_info->colorimetry.matrix;
1673
1674 GST_DEBUG ("matrix %d -> %d (%d)", in_info->colorimetry.matrix,
1675 out_info->colorimetry.matrix, same_matrix);
1676 GST_DEBUG ("bits %d -> %d (%d)", in_bits, out_bits, same_bits);
1677
1678 color_matrix_set_identity (matrix);
1679
1680 if (same_bits && same_matrix) {
1681 GST_DEBUG ("conversion matrix is not required");
1682
1683 return FALSE;
1684 }
1685
1686 if (in_bits < out_bits) {
1687 gint scale = 1 << (out_bits - in_bits);
1688 color_matrix_scale_components (matrix,
1689 1 / (float) scale, 1 / (float) scale, 1 / (float) scale);
1690 }
1691
1692 GST_DEBUG ("to RGB matrix");
1693 compute_matrix_to_RGB (convert, matrix, in_info);
1694 GST_DEBUG ("current matrix");
1695 color_matrix_debug (matrix);
1696
1697 GST_DEBUG ("to YUV matrix");
1698 compute_matrix_to_YUV (convert, matrix, out_info);
1699 GST_DEBUG ("current matrix");
1700 color_matrix_debug (matrix);
1701
1702 if (in_bits > out_bits) {
1703 gint scale = 1 << (in_bits - out_bits);
1704 color_matrix_scale_components (matrix,
1705 (float) scale, (float) scale, (float) scale);
1706 }
1707
1708 GST_DEBUG ("final matrix");
1709 color_matrix_debug (matrix);
1710
1711 return TRUE;
1712 }
1713
1714 static gboolean
is_uv_swapped(GstVideoFormat format)1715 is_uv_swapped (GstVideoFormat format)
1716 {
1717 static GstVideoFormat swapped_formats[] = {
1718 GST_VIDEO_FORMAT_YV12,
1719 GST_VIDEO_FORMAT_NV21,
1720 };
1721 gint i;
1722
1723 for (i = 0; i < G_N_ELEMENTS (swapped_formats); i++) {
1724 if (format == swapped_formats[i])
1725 return TRUE;
1726 }
1727
1728 return FALSE;
1729 }
1730
1731 typedef struct
1732 {
1733 const gchar *read_chroma;
1734 const gchar *write_chroma;
1735 const gchar *unpack_function;
1736 gfloat scale_h, scale_v;
1737 gfloat chroma_scale_h, chroma_scale_v;
1738 gint width, height;
1739 gint chroma_width, chroma_height;
1740 gint in_depth;
1741 gint out_depth;
1742 gint pstride, chroma_pstride;
1743 gint in_shift, out_shift;
1744 gint mask;
1745 gint swap_uv;
1746 /* RGBA specific variables */
1747 gint max_in_val;
1748 GstCudaRGBOrder rgb_order;
1749 } GstCudaKernelTempl;
1750
1751 static gchar *
cuda_converter_generate_yuv_to_yuv_kernel_code(GstCudaConverter * convert,GstCudaKernelTempl * templ)1752 cuda_converter_generate_yuv_to_yuv_kernel_code (GstCudaConverter * convert,
1753 GstCudaKernelTempl * templ)
1754 {
1755 gchar scale_h_str[G_ASCII_DTOSTR_BUF_SIZE];
1756 gchar scale_v_str[G_ASCII_DTOSTR_BUF_SIZE];
1757 gchar chroma_scale_h_str[G_ASCII_DTOSTR_BUF_SIZE];
1758 gchar chroma_scale_v_str[G_ASCII_DTOSTR_BUF_SIZE];
1759 g_ascii_formatd (scale_h_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", templ->scale_h);
1760 g_ascii_formatd (scale_v_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", templ->scale_v);
1761 g_ascii_formatd (chroma_scale_h_str, G_ASCII_DTOSTR_BUF_SIZE, "%f",
1762 templ->chroma_scale_h);
1763 g_ascii_formatd (chroma_scale_v_str, G_ASCII_DTOSTR_BUF_SIZE, "%f",
1764 templ->chroma_scale_v);
1765 return g_strdup_printf (templ_YUV_TO_YUV, scale_h_str, scale_v_str,
1766 chroma_scale_h_str, chroma_scale_v_str, templ->width, templ->height,
1767 templ->chroma_width, templ->chroma_height, templ->in_depth,
1768 templ->out_depth, templ->pstride, templ->chroma_pstride, templ->in_shift,
1769 templ->out_shift, templ->mask, templ->swap_uv, templ->read_chroma,
1770 templ->write_chroma);
1771 }
1772
1773 static gchar *
cuda_converter_generate_yuv_to_rgb_kernel_code(GstCudaConverter * convert,GstCudaKernelTempl * templ,MatrixData * matrix)1774 cuda_converter_generate_yuv_to_rgb_kernel_code (GstCudaConverter * convert,
1775 GstCudaKernelTempl * templ, MatrixData * matrix)
1776 {
1777 gchar matrix_dm[4][4][G_ASCII_DTOSTR_BUF_SIZE];
1778 gchar scale_h_str[G_ASCII_DTOSTR_BUF_SIZE];
1779 gchar scale_v_str[G_ASCII_DTOSTR_BUF_SIZE];
1780 gchar chroma_scale_h_str[G_ASCII_DTOSTR_BUF_SIZE];
1781 gchar chroma_scale_v_str[G_ASCII_DTOSTR_BUF_SIZE];
1782 gint i, j;
1783 for (i = 0; i < 4; i++) {
1784 for (j = 0; j < 4; j++) {
1785 g_ascii_formatd (matrix_dm[i][j], G_ASCII_DTOSTR_BUF_SIZE, "%f",
1786 matrix->dm[i][j]);
1787 }
1788 }
1789 g_ascii_formatd (scale_h_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", templ->scale_h);
1790 g_ascii_formatd (scale_v_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", templ->scale_v);
1791 g_ascii_formatd (chroma_scale_h_str, G_ASCII_DTOSTR_BUF_SIZE, "%f",
1792 templ->chroma_scale_h);
1793 g_ascii_formatd (chroma_scale_v_str, G_ASCII_DTOSTR_BUF_SIZE, "%f",
1794 templ->chroma_scale_v);
1795 return g_strdup_printf (templ_YUV_TO_RGB, matrix_dm[0][3], matrix_dm[1][3],
1796 matrix_dm[2][3], matrix_dm[0][0], matrix_dm[0][1], matrix_dm[0][2],
1797 matrix_dm[1][0], matrix_dm[1][1], matrix_dm[1][2], matrix_dm[2][0],
1798 matrix_dm[2][1], matrix_dm[2][2], scale_h_str, scale_v_str,
1799 chroma_scale_h_str, chroma_scale_v_str, templ->width, templ->height,
1800 templ->chroma_width, templ->chroma_height, templ->in_depth,
1801 templ->out_depth, templ->pstride, templ->chroma_pstride, templ->in_shift,
1802 templ->out_shift, templ->mask, templ->swap_uv, templ->max_in_val,
1803 templ->rgb_order.R, templ->rgb_order.G, templ->rgb_order.B,
1804 templ->rgb_order.A, templ->rgb_order.X, templ->read_chroma);
1805 }
1806
1807 static gchar *
cuda_converter_generate_rgb_to_yuv_kernel_code(GstCudaConverter * convert,GstCudaKernelTempl * templ,MatrixData * matrix)1808 cuda_converter_generate_rgb_to_yuv_kernel_code (GstCudaConverter * convert,
1809 GstCudaKernelTempl * templ, MatrixData * matrix)
1810 {
1811 gchar matrix_dm[4][4][G_ASCII_DTOSTR_BUF_SIZE];
1812 gchar scale_h_str[G_ASCII_DTOSTR_BUF_SIZE];
1813 gchar scale_v_str[G_ASCII_DTOSTR_BUF_SIZE];
1814 gchar chroma_scale_h_str[G_ASCII_DTOSTR_BUF_SIZE];
1815 gchar chroma_scale_v_str[G_ASCII_DTOSTR_BUF_SIZE];
1816 gint i, j;
1817 for (i = 0; i < 4; i++) {
1818 for (j = 0; j < 4; j++) {
1819 g_ascii_formatd (matrix_dm[i][j], G_ASCII_DTOSTR_BUF_SIZE, "%f",
1820 matrix->dm[i][j]);
1821 }
1822 }
1823 g_ascii_formatd (scale_h_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", templ->scale_h);
1824 g_ascii_formatd (scale_v_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", templ->scale_v);
1825 g_ascii_formatd (chroma_scale_h_str, G_ASCII_DTOSTR_BUF_SIZE, "%f",
1826 templ->chroma_scale_h);
1827 g_ascii_formatd (chroma_scale_v_str, G_ASCII_DTOSTR_BUF_SIZE, "%f",
1828 templ->chroma_scale_v);
1829 return g_strdup_printf (templ_RGB_TO_YUV, matrix_dm[0][3], matrix_dm[1][3],
1830 matrix_dm[2][3], matrix_dm[0][0], matrix_dm[0][1], matrix_dm[0][2],
1831 matrix_dm[1][0], matrix_dm[1][1], matrix_dm[1][2], matrix_dm[2][0],
1832 matrix_dm[2][1], matrix_dm[2][2], scale_h_str, scale_v_str,
1833 chroma_scale_h_str, chroma_scale_v_str, templ->width, templ->height,
1834 templ->chroma_width, templ->chroma_height, templ->in_depth,
1835 templ->out_depth, templ->pstride, templ->chroma_pstride, templ->in_shift,
1836 templ->out_shift, templ->mask, templ->swap_uv, templ->unpack_function,
1837 templ->read_chroma, templ->write_chroma);
1838 }
1839
1840 static gchar *
cuda_converter_generate_rgb_to_rgb_kernel_code(GstCudaConverter * convert,GstCudaKernelTempl * templ)1841 cuda_converter_generate_rgb_to_rgb_kernel_code (GstCudaConverter * convert,
1842 GstCudaKernelTempl * templ)
1843 {
1844 gchar scale_h_str[G_ASCII_DTOSTR_BUF_SIZE];
1845 gchar scale_v_str[G_ASCII_DTOSTR_BUF_SIZE];
1846 g_ascii_formatd (scale_h_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", templ->scale_h);
1847 g_ascii_formatd (scale_v_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", templ->scale_v);
1848 return g_strdup_printf (templ_RGB_to_RGB,
1849 scale_h_str, scale_v_str,
1850 templ->width, templ->height,
1851 templ->in_depth, templ->out_depth, templ->pstride,
1852 templ->rgb_order.R, templ->rgb_order.G,
1853 templ->rgb_order.B, templ->rgb_order.A, templ->rgb_order.X,
1854 templ->unpack_function);
1855 }
1856
1857 #define SET_ORDER(o,r,g,b,a,x) G_STMT_START { \
1858 (o)->R = (r); \
1859 (o)->G = (g); \
1860 (o)->B = (b); \
1861 (o)->A = (a); \
1862 (o)->X = (x); \
1863 } G_STMT_END
1864
1865 static void
cuda_converter_get_rgb_order(GstVideoFormat format,GstCudaRGBOrder * order)1866 cuda_converter_get_rgb_order (GstVideoFormat format, GstCudaRGBOrder * order)
1867 {
1868 switch (format) {
1869 case GST_VIDEO_FORMAT_RGBA:
1870 SET_ORDER (order, 0, 1, 2, 3, -1);
1871 break;
1872 case GST_VIDEO_FORMAT_RGBx:
1873 SET_ORDER (order, 0, 1, 2, -1, 3);
1874 break;
1875 case GST_VIDEO_FORMAT_BGRA:
1876 SET_ORDER (order, 2, 1, 0, 3, -1);
1877 break;
1878 case GST_VIDEO_FORMAT_BGRx:
1879 SET_ORDER (order, 2, 1, 0, -1, 3);
1880 break;
1881 case GST_VIDEO_FORMAT_ARGB:
1882 SET_ORDER (order, 1, 2, 3, 0, -1);
1883 break;
1884 case GST_VIDEO_FORMAT_ABGR:
1885 SET_ORDER (order, 3, 2, 1, 0, -1);
1886 break;
1887 case GST_VIDEO_FORMAT_RGB:
1888 SET_ORDER (order, 0, 1, 2, -1, -1);
1889 break;
1890 case GST_VIDEO_FORMAT_BGR:
1891 SET_ORDER (order, 2, 1, 0, -1, -1);
1892 break;
1893 case GST_VIDEO_FORMAT_BGR10A2_LE:
1894 SET_ORDER (order, 1, 2, 3, 0, -1);
1895 break;
1896 case GST_VIDEO_FORMAT_RGB10A2_LE:
1897 SET_ORDER (order, 3, 2, 1, 0, -1);
1898 break;
1899 default:
1900 g_assert_not_reached ();
1901 break;
1902 }
1903 }
1904
1905 static gboolean
cuda_converter_lookup_path(GstCudaConverter * convert)1906 cuda_converter_lookup_path (GstCudaConverter * convert)
1907 {
1908 GstVideoFormat in_format, out_format;
1909 gboolean src_yuv, dst_yuv;
1910 gboolean src_planar, dst_planar;
1911 GstCudaKernelTempl templ = { 0, };
1912 GstVideoInfo *in_info, *out_info;
1913 gboolean ret = FALSE;
1914 CUresult cuda_ret;
1915
1916 in_info = &convert->in_info;
1917 out_info = &convert->out_info;
1918
1919 in_format = GST_VIDEO_INFO_FORMAT (in_info);
1920 out_format = GST_VIDEO_INFO_FORMAT (out_info);
1921
1922 src_yuv = GST_VIDEO_INFO_IS_YUV (in_info);
1923 dst_yuv = GST_VIDEO_INFO_IS_YUV (out_info);
1924
1925 src_planar = GST_VIDEO_INFO_N_PLANES (in_info) ==
1926 GST_VIDEO_INFO_N_COMPONENTS (in_info);
1927 dst_planar = GST_VIDEO_INFO_N_PLANES (out_info) ==
1928 GST_VIDEO_INFO_N_COMPONENTS (out_info);
1929
1930 convert->keep_size = (GST_VIDEO_INFO_WIDTH (&convert->in_info) ==
1931 GST_VIDEO_INFO_WIDTH (&convert->out_info) &&
1932 GST_VIDEO_INFO_HEIGHT (&convert->in_info) ==
1933 GST_VIDEO_INFO_HEIGHT (&convert->out_info));
1934
1935 templ.scale_h = (gfloat) GST_VIDEO_INFO_COMP_WIDTH (in_info, 0) /
1936 (gfloat) GST_VIDEO_INFO_COMP_WIDTH (out_info, 0);
1937 templ.scale_v = (gfloat) GST_VIDEO_INFO_COMP_HEIGHT (in_info, 0) /
1938 (gfloat) GST_VIDEO_INFO_COMP_HEIGHT (out_info, 0);
1939 templ.chroma_scale_h = (gfloat) GST_VIDEO_INFO_COMP_WIDTH (in_info, 1) /
1940 (gfloat) GST_VIDEO_INFO_COMP_WIDTH (out_info, 1);
1941 templ.chroma_scale_v = (gfloat) GST_VIDEO_INFO_COMP_HEIGHT (in_info, 1) /
1942 (gfloat) GST_VIDEO_INFO_COMP_HEIGHT (out_info, 1);
1943 templ.width = GST_VIDEO_INFO_COMP_WIDTH (out_info, 0);
1944 templ.height = GST_VIDEO_INFO_COMP_HEIGHT (out_info, 0);
1945 templ.chroma_width = GST_VIDEO_INFO_COMP_WIDTH (out_info, 1);
1946 templ.chroma_height = GST_VIDEO_INFO_COMP_HEIGHT (out_info, 1);
1947
1948 templ.in_depth = GST_VIDEO_INFO_COMP_DEPTH (in_info, 0);
1949 templ.out_depth = GST_VIDEO_INFO_COMP_DEPTH (out_info, 0);
1950 templ.pstride = GST_VIDEO_INFO_COMP_PSTRIDE (out_info, 0);
1951 templ.chroma_pstride = GST_VIDEO_INFO_COMP_PSTRIDE (out_info, 1);
1952 templ.in_shift = in_info->finfo->shift[0];
1953 templ.out_shift = out_info->finfo->shift[0];
1954 templ.mask = ((1 << templ.out_depth) - 1) << templ.out_shift;
1955 templ.swap_uv = (is_uv_swapped (in_format) != is_uv_swapped (out_format));
1956
1957 if (src_yuv && dst_yuv) {
1958 convert->convert = convert_YUV_TO_YUV;
1959
1960 if (src_planar && dst_planar) {
1961 templ.read_chroma = READ_CHROMA_FROM_PLANAR;
1962 templ.write_chroma = WRITE_CHROMA_TO_PLANAR;
1963 } else if (!src_planar && dst_planar) {
1964 templ.read_chroma = READ_CHROMA_FROM_SEMI_PLANAR;
1965 templ.write_chroma = WRITE_CHROMA_TO_PLANAR;
1966 } else if (src_planar && !dst_planar) {
1967 templ.read_chroma = READ_CHROMA_FROM_PLANAR;
1968 templ.write_chroma = WRITE_CHROMA_TO_SEMI_PLANAR;
1969 } else {
1970 templ.read_chroma = READ_CHROMA_FROM_SEMI_PLANAR;
1971 templ.write_chroma = WRITE_CHROMA_TO_SEMI_PLANAR;
1972 }
1973
1974 convert->kernel_source =
1975 cuda_converter_generate_yuv_to_yuv_kernel_code (convert, &templ);
1976 convert->func_names[0] = GST_CUDA_KERNEL_FUNC;
1977
1978 ret = TRUE;
1979 } else if (src_yuv && !dst_yuv) {
1980 MatrixData matrix;
1981
1982 if (src_planar) {
1983 templ.read_chroma = READ_CHROMA_FROM_PLANAR;
1984 } else {
1985 templ.read_chroma = READ_CHROMA_FROM_SEMI_PLANAR;
1986 }
1987
1988 templ.max_in_val = (1 << templ.in_depth) - 1;
1989 cuda_converter_get_rgb_order (out_format, &templ.rgb_order);
1990
1991 cuda_converter_get_matrix (convert, &matrix, in_info, out_info);
1992 convert->kernel_source =
1993 cuda_converter_generate_yuv_to_rgb_kernel_code (convert,
1994 &templ, &matrix);
1995 convert->func_names[0] = GST_CUDA_KERNEL_FUNC;
1996
1997 convert->convert = convert_YUV_TO_RGB;
1998
1999 ret = TRUE;
2000 } else if (!src_yuv && dst_yuv) {
2001 MatrixData matrix;
2002 gsize element_size = 8;
2003 GstVideoFormat unpack_format;
2004 GstVideoFormat y444_format;
2005 GstVideoInfo unpack_info;
2006 GstVideoInfo y444_info;
2007 gint i;
2008
2009 if (dst_planar) {
2010 templ.write_chroma = WRITE_CHROMA_TO_PLANAR;
2011 } else {
2012 templ.write_chroma = WRITE_CHROMA_TO_SEMI_PLANAR;
2013 }
2014 templ.read_chroma = READ_CHROMA_FROM_PLANAR;
2015
2016 cuda_converter_get_rgb_order (in_format, &convert->in_rgb_order);
2017
2018 if (templ.in_depth > 8) {
2019 /* FIXME: RGB10A2_LE and BGR10A2_LE only */
2020 element_size = 16;
2021 unpack_format = GST_VIDEO_FORMAT_ARGB64;
2022 y444_format = GST_VIDEO_FORMAT_Y444_16LE;
2023 templ.unpack_function = unpack_to_ARGB64;
2024 } else {
2025 unpack_format = GST_VIDEO_FORMAT_ARGB;
2026 y444_format = GST_VIDEO_FORMAT_Y444;
2027 templ.unpack_function = unpack_to_ARGB;
2028 }
2029
2030 gst_video_info_set_format (&unpack_info,
2031 unpack_format, GST_VIDEO_INFO_WIDTH (in_info),
2032 GST_VIDEO_INFO_HEIGHT (in_info));
2033 gst_video_info_set_format (&y444_info,
2034 y444_format, GST_VIDEO_INFO_WIDTH (in_info),
2035 GST_VIDEO_INFO_HEIGHT (in_info));
2036
2037 templ.in_depth = GST_VIDEO_INFO_COMP_DEPTH (&unpack_info, 0);
2038
2039 cuda_ret = CuMemAllocPitch (&convert->unpack_surface.device_ptr,
2040 &convert->unpack_surface.cuda_stride,
2041 GST_VIDEO_INFO_COMP_WIDTH (&unpack_info, 0) *
2042 GST_VIDEO_INFO_COMP_PSTRIDE (&unpack_info, 0),
2043 GST_VIDEO_INFO_HEIGHT (&unpack_info), element_size);
2044
2045 if (!gst_cuda_result (cuda_ret)) {
2046 GST_ERROR ("couldn't alloc unpack surface");
2047 return FALSE;
2048 }
2049
2050 for (i = 0; i < 3; i++) {
2051 cuda_ret = CuMemAllocPitch (&convert->y444_surface[i].device_ptr,
2052 &convert->y444_surface[i].cuda_stride,
2053 GST_VIDEO_INFO_COMP_WIDTH (&y444_info, i) *
2054 GST_VIDEO_INFO_COMP_PSTRIDE (&y444_info, i),
2055 GST_VIDEO_INFO_COMP_HEIGHT (&y444_info, i), element_size);
2056
2057 if (!gst_cuda_result (cuda_ret)) {
2058 GST_ERROR ("couldn't alloc %dth y444 surface", i);
2059 return FALSE;
2060 }
2061 }
2062
2063 cuda_converter_get_matrix (convert, &matrix, &unpack_info, &y444_info);
2064
2065 convert->kernel_source =
2066 cuda_converter_generate_rgb_to_yuv_kernel_code (convert,
2067 &templ, &matrix);
2068
2069 convert->func_names[0] = GST_CUDA_KERNEL_FUNC_TO_ARGB;
2070 convert->func_names[1] = GST_CUDA_KERNEL_FUNC_TO_Y444;
2071 convert->func_names[2] = GST_CUDA_KERNEL_FUNC_Y444_TO_YUV;
2072
2073 convert->convert = convert_RGB_TO_YUV;
2074
2075 ret = TRUE;
2076 } else {
2077 gsize element_size = 8;
2078 GstVideoFormat unpack_format;
2079 GstVideoInfo unpack_info;
2080
2081 cuda_converter_get_rgb_order (in_format, &convert->in_rgb_order);
2082 cuda_converter_get_rgb_order (out_format, &templ.rgb_order);
2083
2084 if (templ.in_depth > 8) {
2085 /* FIXME: RGB10A2_LE and BGR10A2_LE only */
2086 element_size = 16;
2087 unpack_format = GST_VIDEO_FORMAT_ARGB64;
2088 templ.unpack_function = unpack_to_ARGB64;
2089 } else {
2090 unpack_format = GST_VIDEO_FORMAT_ARGB;
2091 templ.unpack_function = unpack_to_ARGB;
2092 }
2093
2094 gst_video_info_set_format (&unpack_info,
2095 unpack_format, GST_VIDEO_INFO_WIDTH (in_info),
2096 GST_VIDEO_INFO_HEIGHT (in_info));
2097
2098 templ.in_depth = GST_VIDEO_INFO_COMP_DEPTH (&unpack_info, 0);
2099
2100 cuda_ret = CuMemAllocPitch (&convert->unpack_surface.device_ptr,
2101 &convert->unpack_surface.cuda_stride,
2102 GST_VIDEO_INFO_COMP_WIDTH (&unpack_info, 0) *
2103 GST_VIDEO_INFO_COMP_PSTRIDE (&unpack_info, 0),
2104 GST_VIDEO_INFO_HEIGHT (&unpack_info), element_size);
2105
2106 if (!gst_cuda_result (cuda_ret)) {
2107 GST_ERROR ("couldn't alloc unpack surface");
2108 return FALSE;
2109 }
2110
2111 convert->kernel_source =
2112 cuda_converter_generate_rgb_to_rgb_kernel_code (convert, &templ);
2113
2114 convert->func_names[0] = GST_CUDA_KERNEL_FUNC_TO_ARGB;
2115 convert->func_names[1] = GST_CUDA_KERNEL_FUNC_SCALE_RGB;
2116
2117 convert->convert = convert_RGB_TO_RGB;
2118
2119 ret = TRUE;
2120 }
2121
2122 if (!ret) {
2123 GST_DEBUG ("no path found");
2124
2125 return FALSE;
2126 }
2127
2128 GST_TRACE ("configured CUDA kernel source\n%s", convert->kernel_source);
2129
2130 return TRUE;
2131 }
2132