• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

// Copyright (C) 2014, Itseez, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
7
// When the build defines DOUBLE_SUPPORT, enable a double-precision extension:
// the AMD one if the compiler advertises it, otherwise the Khronos one.
#if defined(DOUBLE_SUPPORT)
#if defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined(cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#endif
15
// Element sizes in bytes, cast to int so they can be passed to mad24().
// T is the element type read from A; WT is the type read from B and written to D.
#define TSIZE  ((int)sizeof(T))
#define WTSIZE ((int)sizeof(WT))

// Byte offsets added to the uchar base pointers: row y of A, and the x-th
// WT-sized slot of B's first row. Both expand in place and therefore rely on
// the names x, y, A_step, A_offset and B_offset being visible at the use site.
#define IND_A mad24(y, A_step, A_offset)
#define IND_B mad24(x, WTSIZE, B_offset)

// Row stride of B expressed in WT elements (B_step divided by the element size).
// Fully parenthesized so the division is done before any operator applied at
// the use site (e.g. i*STEP_B, a/STEP_B, a%STEP_B).
// NOTE(review): assumes B_step is a multiple of WTSIZE - confirm on the host side.
#define STEP_B (B_step / WTSIZE)

// Row stride used for the __local tiles: one element more than LOCAL_SIZE.
#define LOCAL_SIZE_ODD (LOCAL_SIZE + 1)
24
// MUL(a, b): accumulate the product of a and b into the variable `sum` that must
// be visible where the macro is expanded.
//   cn == 2, kercn == 2 : complex product of a (.x, .y) and b (.x, .y),
//                         added to sum.x / sum.y.
//   cn == 2, otherwise  : b carries two complex values (.x, .y) and (.z, .w);
//                         a is multiplied with each and the results are added
//                         to sum.x / sum.y and sum.z / sum.w respectively.
//   cn != 2             : fused multiply-add, sum = a * b + sum.
// Every variant is wrapped in do { } while (0) and has no trailing semicolon,
// so `MUL(p, q);` is exactly one statement and is safe under an unbraced
// if/else or loop. Arguments are parenthesized before member access.
#if cn==2
#if kercn==2
#define MUL(a, b) \
    do { \
        sum.x += fma((a).x, (b).x, - (a).y * (b).y); \
        sum.y += fma((a).x, (b).y, (a).y * (b).x); \
    } while (0)
#else
#define MUL(a, b) \
    do { \
        sum.x += fma((a).x, (b).x, - (a).y * (b).y); \
        sum.y += fma((a).x, (b).y, (a).y * (b).x); \
        sum.z += fma((a).x, (b).z, - (a).y * (b).w); \
        sum.w += fma((a).x, (b).w, (a).y * (b).z); \
    } while (0)
#endif
#else
#define MUL(a, b) do { sum = fma((a), (b), sum); } while (0)
#endif
44
45
// Matrix-multiply kernel: each work-item produces one WT element of D.
//
// For the output position (x, y) it accumulates MUL(A(y, i), B(i, x)) over
// i in [0, n) and stores alpha * sum; when HAVE_C is set, the value already
// present in D at that position is multiplied by beta and added.
//
// A_ptr / B_ptr / D_ptr are byte pointers. A_step, A_offset, B_offset, D_step
// and D_offset are added to them as byte quantities; B_step is turned into a
// WT-element stride through STEP_B. x indexes WT-sized elements.
// Work-items with x >= D_cols or y >= D_rows compute and store nothing.
//
// Build-time switches read here:
//   LOCAL_SIZE == 1 : accumulate straight from global memory;
//   otherwise       : stage LOCAL_SIZE-wide tiles of A and B in __local memory.
//   NO_MULT         : also process a trailing partial tile (without it the
//                     remainder n % LOCAL_SIZE is not visited).
//   HAVE_C          : blend the existing content of D as described above.
__kernel void gemm(__global const uchar * A_ptr, int A_step, int A_offset,
                   __global const uchar * B_ptr, int B_step, int B_offset,
                   __global uchar * D_ptr, int D_step, int D_offset, int D_rows, int D_cols,
                   int n, T1 alpha, T1 beta)
{
    // x, y and sum must keep these names: IND_A, IND_B and MUL expand to them.
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const bool inside = (x < D_cols) && (y < D_rows);

    // Start of row y in A and of column slot x in B.
    __global const T* A = (__global const T*)(A_ptr + IND_A);
    __global const WT* B = (__global const WT*)(B_ptr + IND_B);

    WT sum = (WT)(0);

#if LOCAL_SIZE == 1

    if (inside)
    {
        for (int k = 0; k < n; ++k)
        {
            MUL(A[k], B[k*STEP_B]);
        }
    }

#else

    const int lx = get_local_id(0);
    const int ly = get_local_id(1);

    // Tiles use a row stride of LOCAL_SIZE + 1 (presumably to spread accesses
    // across local-memory banks - not stated in this file).
    __local T  a_tile[LOCAL_SIZE_ODD*LOCAL_SIZE];
    __local WT b_tile[LOCAL_SIZE_ODD*LOCAL_SIZE];

    // Slot this work-item fills in both tiles; constant across iterations.
    const int slot = mad24(ly, LOCAL_SIZE_ODD, lx);

#if NO_MULT
    const int tiles = (n + LOCAL_SIZE-1)/LOCAL_SIZE;
#else
    const int tiles = n/LOCAL_SIZE;
#endif

    for (int t = 0; t < tiles; ++t)
    {
        const int k0 = t * LOCAL_SIZE;

        // Loads are guarded individually; the barriers below are reached by
        // every work-item regardless of these tests.
        if (k0 + lx < n && y < D_rows)
            a_tile[slot] = A[mad24(t, LOCAL_SIZE, lx)];

        const int b_row = mad24(t, LOCAL_SIZE, ly);
        if (k0 + ly < n && x < D_cols)
            b_tile[slot] = B[b_row*STEP_B];

        barrier(CLK_LOCAL_MEM_FENCE);

        if (inside)
        {
#if NO_MULT
            // The last tile may be only partially filled.
            const int depth = min(LOCAL_SIZE, n - k0);
#else
            const int depth = LOCAL_SIZE;
#endif
            for (int k = 0; k < depth; ++k)
            {
                MUL(a_tile[mad24(ly, LOCAL_SIZE_ODD, k)], b_tile[mad24(k, LOCAL_SIZE_ODD, lx)]);
            }
        }

        // Keep the tiles intact until every work-item has finished reading them.
        barrier(CLK_LOCAL_MEM_FENCE);
    }

#endif

    if (inside)
    {
        __global WT* dst = (__global WT*)(D_ptr + mad24(y, D_step, mad24(x, WTSIZE, D_offset)));
#if HAVE_C
        dst[0] = mad(alpha, sum, dst[0]*beta);
#else
        dst[0] = alpha * sum;
#endif
    }
}
107        D[0] = mad(alpha, sum, D[0]*beta);
108#else
109        D[0] = alpha * sum;
110#endif
111    }
112}