1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "common.h"
17
18 static char hash_table_kernel[] =
19 "#if 0\n"
20 "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n"
21 "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n"
22 "#endif\n"
23 "typedef struct BinNode {\n"
24 " int value;\n"
25 " atomic_uintptr_t pNext;\n"
26 "} BinNode;\n"
27
28 "__kernel void build_hash_table(__global uint* input, __global BinNode* "
29 "pNodes, volatile __global atomic_uint* pNumNodes, uint numBins)\n"
30 "{\n"
31 " __global BinNode *pNew = &pNodes[ atomic_fetch_add_explicit(pNumNodes, "
32 "1u, memory_order_relaxed, memory_scope_all_svm_devices) ];\n"
33 " uint i = get_global_id(0);\n"
34 " uint b = input[i] % numBins;\n"
35 " pNew->value = input[i];\n"
36 " uintptr_t next = atomic_load_explicit(&(pNodes[b].pNext), "
37 "memory_order_seq_cst, memory_scope_all_svm_devices);\n"
38 " do\n"
39 " {\n"
40 " atomic_store_explicit(&(pNew->pNext), next, memory_order_seq_cst, "
41 "memory_scope_all_svm_devices);\n" // always inserting at head of list
42 " } while(!atomic_compare_exchange_strong_explicit(&(pNodes[b].pNext), "
43 "&next, (uintptr_t)pNew, memory_order_seq_cst, memory_order_relaxed, "
44 "memory_scope_all_svm_devices));\n"
45 "}\n";
46
47 typedef struct BinNode{
48 cl_uint value;
49 struct BinNode* pNext;
50 } BinNode;
51
build_hash_table_on_host(cl_context c,cl_uint * input,size_t inputSize,BinNode * pNodes,cl_int volatile * pNumNodes,cl_uint numBins)52 void build_hash_table_on_host(cl_context c, cl_uint* input, size_t inputSize, BinNode* pNodes, cl_int volatile *pNumNodes, cl_uint numBins)
53 {
54 for(cl_uint i = 0; i < inputSize; i++)
55 {
56 BinNode *pNew = &pNodes[ AtomicFetchAddExplicit(pNumNodes, 1, memory_order_relaxed) ];
57 cl_uint b = input[i] % numBins;
58 pNew->value = input[i];
59
60 BinNode *next = pNodes[b].pNext;
61 do {
62 pNew->pNext = next; // always inserting at head of list
63 } while(!AtomicCompareExchangeStrongExplicit(&(pNodes[b].pNext), &next, pNew, memory_order_relaxed, memory_order_seq_cst));
64 }
65 }
66
67
launch_kernels_and_verify(clContextWrapper & context,clCommandQueueWrapper * queues,clKernelWrapper & kernel,cl_uint num_devices,cl_uint numBins,size_t num_pixels)68 int launch_kernels_and_verify(clContextWrapper &context, clCommandQueueWrapper* queues, clKernelWrapper &kernel, cl_uint num_devices, cl_uint numBins, size_t num_pixels)
69 {
70 int err = CL_SUCCESS;
71 cl_uint *pInputImage = (cl_uint*) clSVMAlloc(context, CL_MEM_READ_ONLY | CL_MEM_SVM_FINE_GRAIN_BUFFER, sizeof(cl_uint) * num_pixels, 0);
72 BinNode *pNodes = (BinNode*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, sizeof(BinNode) * (num_pixels * (num_devices + 1) + numBins), 0);
73 cl_int *pNumNodes = (cl_int*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, sizeof(cl_int), 0);
74
75 *pNumNodes = numBins; // using the first numBins nodes to hold the list heads.
76 for(cl_uint i=0;i<numBins;i++) {
77 pNodes[i].pNext = NULL;
78 };
79
80 for(cl_uint i=0; i < num_pixels; i++) pInputImage[i] = i;
81
82 err |= clSetKernelArgSVMPointer(kernel, 0, pInputImage);
83 err |= clSetKernelArgSVMPointer(kernel, 1, pNodes);
84 err |= clSetKernelArgSVMPointer(kernel, 2, pNumNodes);
85 err |= clSetKernelArg(kernel, 3, sizeof(cl_uint), (void*) &numBins);
86
87 test_error(err, "clSetKernelArg failed");
88
89 cl_event done;
90 // get all the devices going simultaneously, each device (and the host) will insert all the pixels.
91 for(cl_uint d=0; d<num_devices; d++)
92 {
93 err = clEnqueueNDRangeKernel(queues[d], kernel, 1, NULL, &num_pixels, 0, 0, NULL, &done);
94 test_error(err,"clEnqueueNDRangeKernel failed");
95 }
96 for(cl_uint d=0; d<num_devices; d++) clFlush(queues[d]);
97
98 // wait until we see some activity from a device (try to run host side simultaneously).
99 while(numBins == AtomicLoadExplicit(pNumNodes, memory_order_relaxed));
100
101 build_hash_table_on_host(context, pInputImage, num_pixels, pNodes, pNumNodes, numBins);
102
103 for(cl_uint d=0; d<num_devices; d++) clFinish(queues[d]);
104
105 cl_uint num_items = 0;
106 // check correctness of each bin in the hash table.
107 for(cl_uint i = 0; i < numBins; i++)
108 {
109 BinNode *pNode = pNodes[i].pNext;
110 while(pNode)
111 {
112 if((pNode->value % numBins) != i)
113 {
114 log_error("Something went wrong, item is in wrong hash bucket\n");
115 break;
116 }
117 num_items++;
118 pNode = pNode->pNext;
119 }
120 }
121
122 clReleaseEvent(done);
123 clSVMFree(context, pInputImage);
124 clSVMFree(context, pNodes);
125 clSVMFree(context, pNumNodes);
126 // each device and the host inserted all of the pixels, check that none are missing.
127 if(num_items != num_pixels * (num_devices + 1) )
128 {
129 log_error("The hash table is not correct, num items %d, expected num items: %d\n", num_items, num_pixels * (num_devices + 1));
130 return -1; // test did not pass
131 }
132 return 0;
133 }
134
135 // This tests for memory consistency across devices and the host.
136 // Each device and the host simultaneously insert values into a single hash table.
137 // Each bin in the hash table is a linked list. Each bin is protected against simultaneous
138 // update using a lock free technique. The correctness of the list is verfied on the host.
139 // This test requires the new OpenCL 2.0 atomic operations that implement the new seq_cst memory ordering.
test_svm_fine_grain_memory_consistency(cl_device_id deviceID,cl_context c,cl_command_queue queue,int num_elements)140 int test_svm_fine_grain_memory_consistency(cl_device_id deviceID, cl_context c, cl_command_queue queue, int num_elements)
141 {
142 clContextWrapper context;
143 clProgramWrapper program;
144 clKernelWrapper kernel;
145 clCommandQueueWrapper queues[MAXQ];
146
147 cl_uint num_devices = 0;
148 cl_int err = CL_SUCCESS;
149 std::vector<std::string> required_extensions;
150 required_extensions.push_back("cl_khr_int64_base_atomics");
151 required_extensions.push_back("cl_khr_int64_extended_atomics");
152
153 // Make pragmas visible for 64-bit addresses
154 hash_table_kernel[4] = sizeof(void *) == 8 ? '1' : '0';
155
156 char *source[] = { hash_table_kernel };
157
158 err = create_cl_objects(deviceID, (const char**)source, &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_FINE_GRAIN_BUFFER | CL_DEVICE_SVM_ATOMICS, required_extensions);
159 if(err == 1) return 0; // no devices capable of requested SVM level, so don't execute but count test as passing.
160 if(err < 0) return -1; // fail test.
161
162 kernel = clCreateKernel(program, "build_hash_table", &err);
163 test_error(err, "clCreateKernel failed");
164 size_t num_pixels = num_elements;
165
166 int result;
167 cl_uint numBins = 1; // all work groups in all devices and the host code will hammer on this one lock.
168 result = launch_kernels_and_verify(context, queues, kernel, num_devices, numBins, num_pixels);
169 if(result == -1) return result;
170
171 numBins = 2; // 2 locks within in same cache line will get hit from different devices and host.
172 result = launch_kernels_and_verify(context, queues, kernel, num_devices, numBins, num_pixels);
173 if(result == -1) return result;
174
175 numBins = 29; // locks span a few cache lines.
176 result = launch_kernels_and_verify(context, queues, kernel, num_devices, numBins, num_pixels);
177 if(result == -1) return result;
178
179 return result;
180 }
181