1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "common.h"
17 
18 static char hash_table_kernel[] =
19     "#if 0\n"
20     "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n"
21     "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n"
22     "#endif\n"
23     "typedef struct BinNode {\n"
24     " int value;\n"
25     " atomic_uintptr_t pNext;\n"
26     "} BinNode;\n"
27 
28     "__kernel void build_hash_table(__global uint* input, __global BinNode* "
29     "pNodes, volatile __global atomic_uint* pNumNodes, uint numBins)\n"
30     "{\n"
31     " __global BinNode *pNew = &pNodes[ atomic_fetch_add_explicit(pNumNodes, "
32     "1u, memory_order_relaxed, memory_scope_all_svm_devices) ];\n"
33     " uint i = get_global_id(0);\n"
34     " uint b = input[i] % numBins;\n"
35     " pNew->value = input[i];\n"
36     " uintptr_t next = atomic_load_explicit(&(pNodes[b].pNext), "
37     "memory_order_seq_cst, memory_scope_all_svm_devices);\n"
38     " do\n"
39     " {\n"
40     "   atomic_store_explicit(&(pNew->pNext), next, memory_order_seq_cst, "
41     "memory_scope_all_svm_devices);\n" // always inserting at head of list
42     " } while(!atomic_compare_exchange_strong_explicit(&(pNodes[b].pNext), "
43     "&next, (uintptr_t)pNew, memory_order_seq_cst, memory_order_relaxed, "
44     "memory_scope_all_svm_devices));\n"
45     "}\n";
46 
47 typedef struct BinNode{
48   cl_uint value;
49   struct BinNode* pNext;
50 } BinNode;
51 
build_hash_table_on_host(cl_context c,cl_uint * input,size_t inputSize,BinNode * pNodes,cl_int volatile * pNumNodes,cl_uint numBins)52 void build_hash_table_on_host(cl_context c, cl_uint* input, size_t inputSize, BinNode* pNodes, cl_int volatile *pNumNodes, cl_uint numBins)
53 {
54   for(cl_uint i = 0; i < inputSize; i++)
55   {
56     BinNode *pNew = &pNodes[ AtomicFetchAddExplicit(pNumNodes, 1, memory_order_relaxed) ];
57     cl_uint b = input[i] % numBins;
58     pNew->value = input[i];
59 
60     BinNode *next = pNodes[b].pNext;
61     do {
62         pNew->pNext = next;  // always inserting at head of list
63     } while(!AtomicCompareExchangeStrongExplicit(&(pNodes[b].pNext), &next, pNew, memory_order_relaxed, memory_order_seq_cst));
64   }
65 }
66 
67 
launch_kernels_and_verify(clContextWrapper & context,clCommandQueueWrapper * queues,clKernelWrapper & kernel,cl_uint num_devices,cl_uint numBins,size_t num_pixels)68 int launch_kernels_and_verify(clContextWrapper &context, clCommandQueueWrapper* queues, clKernelWrapper &kernel, cl_uint num_devices, cl_uint numBins, size_t num_pixels)
69 {
70   int err = CL_SUCCESS;
71   cl_uint *pInputImage = (cl_uint*) clSVMAlloc(context, CL_MEM_READ_ONLY  | CL_MEM_SVM_FINE_GRAIN_BUFFER, sizeof(cl_uint) * num_pixels, 0);
72   BinNode *pNodes      = (BinNode*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, sizeof(BinNode) * (num_pixels * (num_devices + 1) + numBins), 0);
73   cl_int *pNumNodes       = (cl_int*)  clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, sizeof(cl_int), 0);
74 
75   *pNumNodes = numBins;  // using the first numBins nodes to hold the list heads.
76   for(cl_uint i=0;i<numBins;i++) {
77     pNodes[i].pNext = NULL;
78   };
79 
80   for(cl_uint i=0; i < num_pixels; i++) pInputImage[i] = i;
81 
82   err |= clSetKernelArgSVMPointer(kernel, 0, pInputImage);
83   err |= clSetKernelArgSVMPointer(kernel, 1, pNodes);
84   err |= clSetKernelArgSVMPointer(kernel, 2, pNumNodes);
85   err |= clSetKernelArg(kernel, 3, sizeof(cl_uint), (void*) &numBins);
86 
87   test_error(err, "clSetKernelArg failed");
88 
89   cl_event done;
90   // get all the devices going simultaneously, each device (and the host) will insert all the pixels.
91   for(cl_uint d=0; d<num_devices; d++)
92   {
93     err = clEnqueueNDRangeKernel(queues[d], kernel, 1, NULL, &num_pixels, 0, 0, NULL, &done);
94     test_error(err,"clEnqueueNDRangeKernel failed");
95   }
96   for(cl_uint d=0; d<num_devices; d++) clFlush(queues[d]);
97 
98   // wait until we see some activity from a device (try to run host side simultaneously).
99   while(numBins == AtomicLoadExplicit(pNumNodes, memory_order_relaxed));
100 
101   build_hash_table_on_host(context, pInputImage, num_pixels, pNodes, pNumNodes, numBins);
102 
103   for(cl_uint d=0; d<num_devices; d++) clFinish(queues[d]);
104 
105   cl_uint num_items = 0;
106   // check correctness of each bin in the hash table.
107   for(cl_uint i = 0; i < numBins; i++)
108   {
109     BinNode *pNode = pNodes[i].pNext;
110     while(pNode)
111     {
112       if((pNode->value % numBins) != i)
113       {
114         log_error("Something went wrong, item is in wrong hash bucket\n");
115         break;
116       }
117       num_items++;
118       pNode = pNode->pNext;
119     }
120   }
121 
122   clReleaseEvent(done);
123   clSVMFree(context, pInputImage);
124   clSVMFree(context, pNodes);
125   clSVMFree(context, pNumNodes);
126   // each device and the host inserted all of the pixels, check that none are missing.
127   if(num_items != num_pixels * (num_devices + 1) )
128   {
129     log_error("The hash table is not correct, num items %d, expected num items: %d\n", num_items, num_pixels * (num_devices + 1));
130     return -1; // test did not pass
131   }
132   return 0;
133 }
134 
135 // This tests for memory consistency across devices and the host.
136 // Each device and the host simultaneously insert values into a single hash table.
137 // Each bin in the hash table is a linked list.  Each bin is protected against simultaneous
138 // update using a lock free technique.  The correctness of the list is verfied on the host.
139 // This test requires the new OpenCL 2.0 atomic operations that implement the new seq_cst memory ordering.
test_svm_fine_grain_memory_consistency(cl_device_id deviceID,cl_context c,cl_command_queue queue,int num_elements)140 int test_svm_fine_grain_memory_consistency(cl_device_id deviceID, cl_context c, cl_command_queue queue, int num_elements)
141 {
142   clContextWrapper context;
143   clProgramWrapper program;
144   clKernelWrapper kernel;
145   clCommandQueueWrapper queues[MAXQ];
146 
147   cl_uint     num_devices = 0;
148   cl_int      err = CL_SUCCESS;
149   std::vector<std::string> required_extensions;
150   required_extensions.push_back("cl_khr_int64_base_atomics");
151   required_extensions.push_back("cl_khr_int64_extended_atomics");
152 
153   // Make pragmas visible for 64-bit addresses
154   hash_table_kernel[4] = sizeof(void *) == 8 ? '1' : '0';
155 
156   char *source[] = { hash_table_kernel };
157 
158   err = create_cl_objects(deviceID, (const char**)source, &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_FINE_GRAIN_BUFFER | CL_DEVICE_SVM_ATOMICS, required_extensions);
159   if(err == 1) return 0; // no devices capable of requested SVM level, so don't execute but count test as passing.
160   if(err < 0) return -1; // fail test.
161 
162   kernel = clCreateKernel(program, "build_hash_table", &err);
163   test_error(err, "clCreateKernel failed");
164   size_t num_pixels = num_elements;
165 
166   int result;
167   cl_uint numBins = 1;  // all work groups in all devices and the host code will hammer on this one lock.
168   result = launch_kernels_and_verify(context, queues, kernel, num_devices, numBins, num_pixels);
169   if(result == -1) return result;
170 
171   numBins = 2;  // 2 locks within in same cache line will get hit from different devices and host.
172   result = launch_kernels_and_verify(context, queues, kernel, num_devices, numBins, num_pixels);
173   if(result == -1) return result;
174 
175   numBins = 29; // locks span a few cache lines.
176   result = launch_kernels_and_verify(context, queues, kernel, num_devices, numBins, num_pixels);
177   if(result == -1) return result;
178 
179   return result;
180 }
181