• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "ThreadPool.h"
17 #include "errorHelpers.h"
18 #include "fpcontrol.h"
19 #include <stdio.h>
20 #include <stdlib.h>
21 
22 #if defined(__APPLE__) || defined(__linux__) || defined(_WIN32)
23 // or any other POSIX system
24 
25 #if defined(_WIN32)
26 #include <windows.h>
27 #if defined(_MSC_VER)
28 #include <intrin.h>
29 #endif
30 #include "mingw_compat.h"
31 #include <process.h>
32 #else // !_WIN32
33 #include <pthread.h>
34 #include <unistd.h>
35 #include <sys/errno.h>
36 #ifdef __linux__
37 #include <sched.h>
38 #endif
39 #endif // !_WIN32
40 
41 // declarations
42 #ifdef _WIN32
43 void ThreadPool_WorkerFunc(void *p);
44 #else
45 void *ThreadPool_WorkerFunc(void *p);
46 #endif
47 void ThreadPool_Init(void);
48 void ThreadPool_Exit(void);
49 
50 #if defined(__MINGW32__)
51 // Mutex for implementing super heavy atomic operations if you don't have GCC or
52 // MSVC
53 CRITICAL_SECTION gAtomicLock;
54 #elif defined(__GNUC__) || defined(_MSC_VER)
55 #else
56 pthread_mutex_t gAtomicLock;
57 #endif
58 
59 // Atomic add operator with mem barrier.  Mem barrier needed to protect state
60 // modified by the worker functions.
// Atomically adds b to *a and returns the value *a held before the addition.
// Which primitive is used depends on the toolchain; see each branch below.
cl_int ThreadPool_AtomicAdd(volatile cl_int *a, cl_int b)
{
#if defined(__MINGW32__)
    // No atomics on Mingw32: serialize the read-modify-write on gAtomicLock.
    cl_int previous;

    EnterCriticalSection(&gAtomicLock);
    previous = *a;
    *a = previous + b;
    LeaveCriticalSection(&gAtomicLock);
    return previous;
#elif defined(__GNUC__)
    // GCC extension:
    // http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins
    // Open question carried over from the original author: is an additional
    // __sync_synchronize() needed here, or does __sync_fetch_and_add already
    // synchronize?
    const cl_int previous = __sync_fetch_and_add(a, b);
    return previous;
#elif defined(_MSC_VER)
    const LONG previous =
        _InterlockedExchangeAdd((volatile LONG *)a, (LONG)b);
    return (cl_int)previous;
#else
#warning Please add a atomic add implementation here, with memory barrier.  Fallback code is slow.
    // Generic fallback: guard the update with gAtomicLock. Lock failures are
    // logged but the update is still attempted.
    cl_int previous;

    if (pthread_mutex_lock(&gAtomicLock))
    {
        log_error("Atomic operation failed. pthread_mutex_lock(&gAtomicLock) "
                  "returned an error\n");
    }
    previous = *a;
    *a = previous + b;
    if (pthread_mutex_unlock(&gAtomicLock))
    {
        log_error("Failed to release gAtomicLock. Further atomic operations "
                  "may deadlock!\n");
    }
    return previous;
#endif
}
91 
92 #if defined(_WIN32)
93 // Uncomment the following line if Windows XP support is not required.
94 // #define HAS_INIT_ONCE_EXECUTE_ONCE 1
95 
96 #if defined(HAS_INIT_ONCE_EXECUTE_ONCE)
97 #define _INIT_ONCE INIT_ONCE
98 #define _PINIT_ONCE PINIT_ONCE
99 #define _InitOnceExecuteOnce InitOnceExecuteOnce
100 #else // !HAS_INIT_ONCE_EXECUTE_ONCE
101 
102 typedef volatile LONG _INIT_ONCE;
103 typedef _INIT_ONCE *_PINIT_ONCE;
104 typedef BOOL(CALLBACK *_PINIT_ONCE_FN)(_PINIT_ONCE, PVOID, PVOID *);
105 
106 #define _INIT_ONCE_UNINITIALIZED 0
107 #define _INIT_ONCE_IN_PROGRESS 1
108 #define _INIT_ONCE_DONE 2
109 
_InitOnceExecuteOnce(_PINIT_ONCE InitOnce,_PINIT_ONCE_FN InitFn,PVOID Parameter,LPVOID * Context)110 static BOOL _InitOnceExecuteOnce(_PINIT_ONCE InitOnce, _PINIT_ONCE_FN InitFn,
111                                  PVOID Parameter, LPVOID *Context)
112 {
113     while (*InitOnce != _INIT_ONCE_DONE)
114     {
115         if (*InitOnce != _INIT_ONCE_IN_PROGRESS
116             && _InterlockedCompareExchange(InitOnce, _INIT_ONCE_IN_PROGRESS,
117                                            _INIT_ONCE_UNINITIALIZED)
118                 == _INIT_ONCE_UNINITIALIZED)
119         {
120             InitFn(InitOnce, Parameter, Context);
121             *InitOnce = _INIT_ONCE_DONE;
122             return TRUE;
123         }
124         Sleep(1);
125     }
126     return TRUE;
127 }
128 #endif // !HAS_INIT_ONCE_EXECUTE_ONCE
129 
130 // Uncomment the following line if Windows XP support is not required.
131 // #define HAS_CONDITION_VARIABLE 1
132 
133 #if defined(HAS_CONDITION_VARIABLE)
134 #define _CONDITION_VARIABLE CONDITION_VARIABLE
135 #define _InitializeConditionVariable InitializeConditionVariable
136 #define _SleepConditionVariableCS SleepConditionVariableCS
137 #define _WakeAllConditionVariable WakeAllConditionVariable
138 #else // !HAS_CONDITION_VARIABLE
139 typedef struct
140 {
141     HANDLE mEvent; // Used to park the thread.
142     // Used to protect mWaiters, mGeneration and mReleaseCount:
143     CRITICAL_SECTION mLock[1];
144     volatile cl_int mWaiters; // Number of threads waiting on this cond var.
145     volatile cl_int mGeneration; // Wait generation count.
146     volatile cl_int mReleaseCount; // Number of releases to execute before
147                                    // reseting the event.
148 } _CONDITION_VARIABLE;
149 
150 typedef _CONDITION_VARIABLE *_PCONDITION_VARIABLE;
151 
// Prepares cond_var for use: creates its manual-reset, initially unsignaled
// event, initializes its internal lock and zeroes all three counters.
//
// mReleaseCount is cleared unconditionally. It is read by
// _SleepConditionVariableCS in every build, so leaving it uninitialized when
// NDEBUG is defined would make waiters depend on whatever the storage held.
static void _InitializeConditionVariable(_PCONDITION_VARIABLE cond_var)
{
    cond_var->mEvent = CreateEvent(NULL, TRUE, FALSE, NULL);
    InitializeCriticalSection(cond_var->mLock);
    cond_var->mWaiters = 0;
    cond_var->mGeneration = 0;
    cond_var->mReleaseCount = 0;
}
162 
// Waits on cond_var. The caller must hold cond_lock on entry; it is released
// while waiting and held again when the function returns. The third argument
// is not used: the wait has no timeout.
static void _SleepConditionVariableCS(_PCONDITION_VARIABLE cond_var,
                                      PCRITICAL_SECTION cond_lock,
                                      DWORD ignored)
{
    cl_int entryGeneration;
    BOOL released;

    // Register as a waiter and remember the generation we went to sleep in,
    // then let go of the caller's lock.
    EnterCriticalSection(cond_var->mLock);
    entryGeneration = cond_var->mGeneration;
    ++cond_var->mWaiters;
    LeaveCriticalSection(cond_var->mLock);
    LeaveCriticalSection(cond_lock);

    // Block on the event until a wake-up issued after we registered (a newer
    // generation) still has releases left to hand out.
    do
    {
        WaitForSingleObject(cond_var->mEvent, INFINITE);
        EnterCriticalSection(cond_var->mLock);
        released = cond_var->mReleaseCount > 0
            && cond_var->mGeneration != entryGeneration;
        LeaveCriticalSection(cond_var->mLock);
    } while (!released);

    // Re-take the caller's lock, consume one release and deregister. The
    // waiter consuming the last release resets the event.
    EnterCriticalSection(cond_lock);
    EnterCriticalSection(cond_var->mLock);
    if (--cond_var->mReleaseCount == 0)
    {
        ResetEvent(cond_var->mEvent);
    }
    --cond_var->mWaiters;
    LeaveCriticalSection(cond_var->mLock);
}
195 
// Wakes every thread currently waiting on cond_var. Does nothing when there
// are no waiters.
static void _WakeAllConditionVariable(_PCONDITION_VARIABLE cond_var)
{
    EnterCriticalSection(cond_var->mLock);
    if (cond_var->mWaiters <= 0)
    {
        LeaveCriticalSection(cond_var->mLock);
        return;
    }

    // Start a new generation, allow one release per current waiter, and
    // signal the event they are blocked on.
    ++cond_var->mGeneration;
    cond_var->mReleaseCount = cond_var->mWaiters;
    SetEvent(cond_var->mEvent);
    LeaveCriticalSection(cond_var->mLock);
}
207 #endif // !HAS_CONDITION_VARIABLE
208 #endif // _WIN32
209 
210 #define MAX_COUNT (1 << 29)
211 
212 // Global state to coordinate whether the threads have been launched
213 // successfully or not
214 #if defined(_MSC_VER) && (_WIN32_WINNT >= 0x600)
215 static _INIT_ONCE threadpool_init_control;
216 #elif defined(_WIN32) // MingW of XP
217 static int threadpool_init_control;
218 #else // Posix platforms
219 pthread_once_t threadpool_init_control = PTHREAD_ONCE_INIT;
220 #endif
221 cl_int threadPoolInitErr = -1; // set to CL_SUCCESS on successful thread launch
222 
223 // critical region lock around ThreadPool_Do.  We can only run one ThreadPool_Do
224 // at a time, because we are too lazy to set up a queue here, and don't expect
225 // to need one.
226 #if defined(_WIN32)
227 CRITICAL_SECTION gThreadPoolLock[1];
228 #else // !_WIN32
229 pthread_mutex_t gThreadPoolLock;
230 #endif // !_WIN32
231 
232 // Condition variable to park ThreadPool threads when not working
233 #if defined(_WIN32)
234 CRITICAL_SECTION cond_lock[1];
235 _CONDITION_VARIABLE cond_var[1];
236 #else // !_WIN32
237 pthread_mutex_t cond_lock;
238 pthread_cond_t cond_var;
239 #endif // !_WIN32
240 
241 // Condition variable state. How many iterations on the function left to run,
242 // set to CL_INT_MAX to cause worker threads to exit. Note: this value might
243 // go negative.
244 volatile cl_int gRunCount = 0;
245 
246 // State that only changes when the threadpool is not working.
247 volatile TPFuncPtr gFunc_ptr = NULL;
248 volatile void *gUserInfo = NULL;
249 volatile cl_int gJobCount = 0;
250 
251 // State that may change while the thread pool is working
252 volatile cl_int jobError = CL_SUCCESS; // err code return for the job as a whole
253 
254 // Condition variable to park caller while waiting
255 #if defined(_WIN32)
256 HANDLE caller_event;
257 #else // !_WIN32
258 pthread_mutex_t caller_cond_lock;
259 pthread_cond_t caller_cond_var;
260 #endif // !_WIN32
261 
262 // # of threads intended to be running. Running threads will decrement this
263 // as they discover they've run out of work to do.
264 volatile cl_int gRunning = 0;
265 
266 // The total number of threads launched.
267 volatile cl_int gThreadCount = 0;
268 #ifdef _WIN32
ThreadPool_WorkerFunc(void * p)269 void ThreadPool_WorkerFunc(void *p)
270 #else
271 void *ThreadPool_WorkerFunc(void *p)
272 #endif
273 {
274     cl_uint threadID = ThreadPool_AtomicAdd((volatile cl_int *)p, 1);
275     cl_int item = ThreadPool_AtomicAdd(&gRunCount, -1);
276     // log_info( "ThreadPool_WorkerFunc start: gRunning = %d\n", gRunning );
277 
278     while (MAX_COUNT > item)
279     {
280         cl_int err;
281 
282         // check for more work to do
283         if (0 >= item)
284         {
285             // log_info("Thread %d has run out of work.\n", threadID);
286 
287             // No work to do. Attempt to block waiting for work
288 #if defined(_WIN32)
289             EnterCriticalSection(cond_lock);
290 #else // !_WIN32
291             if ((err = pthread_mutex_lock(&cond_lock)))
292             {
293                 log_error(
294                     "Error %d from pthread_mutex_lock. Worker %d unable to "
295                     "block waiting for work. ThreadPool_WorkerFunc failed.\n",
296                     err, threadID);
297                 goto exit;
298             }
299 #endif // !_WIN32
300 
301             cl_int remaining = ThreadPool_AtomicAdd(&gRunning, -1);
302             // log_info("ThreadPool_WorkerFunc: gRunning = %d\n",
303             //          remaining - 1);
304             if (1 == remaining)
305             { // last thread out signal the main thread to wake up
306 #if defined(_WIN32)
307                 SetEvent(caller_event);
308 #else // !_WIN32
309                 if ((err = pthread_mutex_lock(&caller_cond_lock)))
310                 {
311                     log_error("Error %d from pthread_mutex_lock. Unable to "
312                               "wake caller.\n",
313                               err);
314                     goto exit;
315                 }
316                 if ((err = pthread_cond_broadcast(&caller_cond_var)))
317                 {
318                     log_error(
319                         "Error %d from pthread_cond_broadcast. Unable to wake "
320                         "up main thread. ThreadPool_WorkerFunc failed.\n",
321                         err);
322                     goto exit;
323                 }
324                 if ((err = pthread_mutex_unlock(&caller_cond_lock)))
325                 {
326                     log_error("Error %d from pthread_mutex_lock. Unable to "
327                               "wake caller.\n",
328                               err);
329                     goto exit;
330                 }
331 #endif // !_WIN32
332             }
333 
334             // loop in case we are woken only to discover that some other thread
335             // already did all the work
336             while (0 >= item)
337             {
338 #if defined(_WIN32)
339                 _SleepConditionVariableCS(cond_var, cond_lock, INFINITE);
340 #else // !_WIN32
341                 if ((err = pthread_cond_wait(&cond_var, &cond_lock)))
342                 {
343                     log_error(
344                         "Error %d from pthread_cond_wait. Unable to block for "
345                         "waiting for work. ThreadPool_WorkerFunc failed.\n",
346                         err);
347                     pthread_mutex_unlock(&cond_lock);
348                     goto exit;
349                 }
350 #endif // !_WIN32
351 
352                 // try again to get a valid item id
353                 item = ThreadPool_AtomicAdd(&gRunCount, -1);
354                 if (MAX_COUNT <= item) // exit if we are done
355                 {
356 #if defined(_WIN32)
357                     LeaveCriticalSection(cond_lock);
358 #else // !_WIN32
359                     pthread_mutex_unlock(&cond_lock);
360 #endif // !_WIN32
361                     goto exit;
362                 }
363             }
364 
365             ThreadPool_AtomicAdd(&gRunning, 1);
366             // log_info("Thread %d has found work.\n", threadID);
367 
368 #if defined(_WIN32)
369             LeaveCriticalSection(cond_lock);
370 #else // !_WIN32
371             if ((err = pthread_mutex_unlock(&cond_lock)))
372             {
373                 log_error(
374                     "Error %d from pthread_mutex_unlock. Unable to block for "
375                     "waiting for work. ThreadPool_WorkerFunc failed.\n",
376                     err);
377                 goto exit;
378             }
379 #endif // !_WIN32
380         }
381 
382         // we have a valid item, so do the work
383         // but only if we haven't already encountered an error
384         if (CL_SUCCESS == jobError)
385         {
386             // log_info("Thread %d doing job %d\n", threadID, item - 1);
387 
388 #if defined(__APPLE__) && defined(__arm__)
389             // On most platforms which support denorm, default is FTZ off.
390             // However, on some hardware where the reference is computed,
391             // default might be flush denorms to zero e.g. arm. This creates
392             // issues in result verification. Since spec allows the
393             // implementation to either flush or not flush denorms to zero, an
394             // implementation may choose not be flush i.e. return denorm result
395             // whereas reference result may be zero (flushed denorm). Hence we
396             // need to disable denorm flushing on host side where reference is
397             // being computed to make sure we get non-flushed reference result.
398             // If implementation returns flushed result, we correctly take care
399             // of that in verification code.
400             FPU_mode_type oldMode;
401             DisableFTZ(&oldMode);
402 #endif
403 
404             // Call the user's function with this item ID
405             err = gFunc_ptr(item - 1, threadID, (void *)gUserInfo);
406 #if defined(__APPLE__) && defined(__arm__)
407             // Restore FP state
408             RestoreFPState(&oldMode);
409 #endif
410 
411             if (err)
412             {
413 #if (__MINGW32__)
414                 EnterCriticalSection(&gAtomicLock);
415                 if (jobError == CL_SUCCESS) jobError = err;
416                 gRunCount = 0;
417                 LeaveCriticalSection(&gAtomicLock);
418 #elif defined(__GNUC__)
419                 // GCC extension:
420                 // http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins
421                 // set the new error if we are the first one there.
422                 __sync_val_compare_and_swap(&jobError, CL_SUCCESS, err);
423 
424                 // drop run count to 0
425                 gRunCount = 0;
426                 __sync_synchronize();
427 #elif defined(_MSC_VER)
428                 // set the new error if we are the first one there.
429                 _InterlockedCompareExchange((volatile LONG *)&jobError, err,
430                                             CL_SUCCESS);
431 
432                 // drop run count to 0
433                 gRunCount = 0;
434                 _mm_mfence();
435 #else
436                 if (pthread_mutex_lock(&gAtomicLock))
437                     log_error(
438                         "Atomic operation failed. "
439                         "pthread_mutex_lock(&gAtomicLock) returned an error\n");
440                 if (jobError == CL_SUCCESS) jobError = err;
441                 gRunCount = 0;
442                 if (pthread_mutex_unlock(&gAtomicLock))
443                     log_error("Failed to release gAtomicLock. Further atomic "
444                               "operations may deadlock\n");
445 #endif
446             }
447         }
448 
449         // get the next item
450         item = ThreadPool_AtomicAdd(&gRunCount, -1);
451     }
452 
453 exit:
454     log_info("ThreadPool: thread %d exiting.\n", threadID);
455     ThreadPool_AtomicAdd(&gThreadCount, -1);
456 #if !defined(_WIN32)
457     return NULL;
458 #endif
459 }
460 
// SetThreadCount() may be used to artificially set the number of worker
// threads. If the value is 0 (the default) the number of threads will be
// determined based on the number of CPU cores.  If it is a unicore machine,
// then 2 will be used, so that we still get some testing for thread safety.
//
// If count < 2 or the CL_TEST_SINGLE_THREADED environment variable is set then
// the code will run single threaded, but will report an error to indicate that
// the test is invalid.  This option is intended for debugging purposes only. It
// is suggested as a convention that test apps set the thread count to 1 in
// response to the -m flag.
//
// SetThreadCount() must be called before the first call to GetThreadCount() or
// ThreadPool_Do(), otherwise the behavior is undefined.
SetThreadCount(int count)474 void SetThreadCount(int count)
475 {
476     if (threadPoolInitErr == CL_SUCCESS)
477     {
478         log_error("Error: It is illegal to set the thread count after the "
479                   "first call to ThreadPool_Do or GetThreadCount\n");
480         abort();
481     }
482 
483     gThreadCount = count;
484 }
485 
ThreadPool_Init(void)486 void ThreadPool_Init(void)
487 {
488     cl_int i;
489     int err;
490     volatile cl_uint threadID = 0;
491 
492     // Check for manual override of multithreading code. We add this for better
493     // debuggability.
494     if (getenv("CL_TEST_SINGLE_THREADED"))
495     {
496         log_error("ERROR: CL_TEST_SINGLE_THREADED is set in the environment. "
497                   "Running single threaded.\n*** TEST IS INVALID! ***\n");
498         gThreadCount = 1;
499         return;
500     }
501 
502     // Figure out how many threads to run -- check first for non-zero to give
503     // the implementation the chance
504     if (0 == gThreadCount)
505     {
506 #if defined(_MSC_VER) || defined(__MINGW64__)
507         PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL;
508         DWORD length = 0;
509 
510         GetLogicalProcessorInformation(NULL, &length);
511         buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(length);
512         if (buffer != NULL)
513         {
514             if (GetLogicalProcessorInformation(buffer, &length) == TRUE)
515             {
516                 PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
517                 while (
518                     ptr
519                     < &buffer[length
520                               / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION)])
521                 {
522                     if (ptr->Relationship == RelationProcessorCore)
523                     {
524                         // Count the number of bits in ProcessorMask (number of
525                         // logical cores)
526                         ULONG mask = ptr->ProcessorMask;
527                         while (mask)
528                         {
529                             ++gThreadCount;
530                             mask &= mask - 1; // Remove 1 bit at a time
531                         }
532                     }
533                     ++ptr;
534                 }
535             }
536             free(buffer);
537         }
538 #elif defined(__MINGW32__)
539         {
540 #warning How about this, instead of hard coding it to 2?
541             SYSTEM_INFO sysinfo;
542             GetSystemInfo(&sysinfo);
543             gThreadCount = sysinfo.dwNumberOfProcessors;
544         }
545 #elif defined(__linux__) && !defined(__ANDROID__)
546         cpu_set_t affinity;
547         if (0 == sched_getaffinity(0, sizeof(cpu_set_t), &affinity))
548         {
549 #if !(defined(CPU_COUNT))
550             gThreadCount = 1;
551 #else
552             gThreadCount = CPU_COUNT(&affinity);
553 #endif
554         }
555         else
556         {
557             // Hopefully your system returns logical cpus here, as does MacOS X
558             gThreadCount = (cl_int)sysconf(_SC_NPROCESSORS_CONF);
559         }
560 #else /* !_WIN32 */
561         // Hopefully your system returns logical cpus here, as does MacOS X
562         gThreadCount = (cl_int)sysconf(_SC_NPROCESSORS_CONF);
563 #endif // !_WIN32
564 
565         // Multithreaded tests are required to run multithreaded even on unicore
566         // systems so as to test thread safety
567         if (1 == gThreadCount) gThreadCount = 2;
568     }
569 
570 // When working in 32 bit limit the thread number to 12
571 // This fix was made due to memory issues in integer_ops test
572 // When running integer_ops, the test opens as many threads as the
573 // machine has and each thread allocates a fixed amount of memory
574 // When running this test on dual socket machine in 32-bit, the
575 // process memory is not sufficient and the test fails
576 #if defined(_WIN32) && !defined(_M_X64)
577     if (gThreadCount > 12)
578     {
579         gThreadCount = 12;
580     }
581 #endif
582 
583     // Allow the app to set thread count to <0 for debugging purposes.
584     // This will cause the test to run single threaded.
585     if (gThreadCount < 2)
586     {
587         log_error("ERROR: Running single threaded because thread count < 2. "
588                   "\n*** TEST IS INVALID! ***\n");
589         gThreadCount = 1;
590         return;
591     }
592 
593 #if defined(_WIN32)
594     InitializeCriticalSection(gThreadPoolLock);
595     InitializeCriticalSection(cond_lock);
596     _InitializeConditionVariable(cond_var);
597     caller_event = CreateEvent(NULL, FALSE, FALSE, NULL);
598 #elif defined(__GNUC__)
599     // Dont rely on PTHREAD_MUTEX_INITIALIZER for intialization of a mutex since
600     // it might cause problem with some flavors of gcc compilers.
601     pthread_cond_init(&cond_var, NULL);
602     pthread_mutex_init(&cond_lock, NULL);
603     pthread_cond_init(&caller_cond_var, NULL);
604     pthread_mutex_init(&caller_cond_lock, NULL);
605     pthread_mutex_init(&gThreadPoolLock, NULL);
606 #endif
607 
608 #if !(defined(__GNUC__) || defined(_MSC_VER) || defined(__MINGW32__))
609     pthread_mutex_initialize(gAtomicLock);
610 #elif defined(__MINGW32__)
611     InitializeCriticalSection(&gAtomicLock);
612 #endif
613     // Make sure the last thread done in the work pool doesn't signal us to wake
614     // before we get to the point where we are supposed to wait
615     //  That would cause a deadlock.
616 #if !defined(_WIN32)
617     if ((err = pthread_mutex_lock(&caller_cond_lock)))
618     {
619         log_error("Error %d from pthread_mutex_lock. Unable to block for work "
620                   "to finish. ThreadPool_Init failed.\n",
621                   err);
622         gThreadCount = 1;
623         return;
624     }
625 #endif // !_WIN32
626 
627     gRunning = gThreadCount;
628     // init threads
629     for (i = 0; i < gThreadCount; i++)
630     {
631 #if defined(_WIN32)
632         uintptr_t handle =
633             _beginthread(ThreadPool_WorkerFunc, 0, (void *)&threadID);
634         err = (handle == 0);
635 #else // !_WIN32
636         pthread_t tid = 0;
637         err = pthread_create(&tid, NULL, ThreadPool_WorkerFunc,
638                              (void *)&threadID);
639 #endif // !_WIN32
640         if (err)
641         {
642             log_error("Error %d launching thread %d\n", err, i);
643             threadPoolInitErr = err;
644             gThreadCount = i;
645             break;
646         }
647     }
648 
649     atexit(ThreadPool_Exit);
650 
651     // block until they are done launching.
652     do
653     {
654 #if defined(_WIN32)
655         WaitForSingleObject(caller_event, INFINITE);
656 #else // !_WIN32
657         if ((err = pthread_cond_wait(&caller_cond_var, &caller_cond_lock)))
658         {
659             log_error("Error %d from pthread_cond_wait. Unable to block for "
660                       "work to finish. ThreadPool_Init failed.\n",
661                       err);
662             pthread_mutex_unlock(&caller_cond_lock);
663             return;
664         }
665 #endif // !_WIN32
666     } while (gRunCount != -gThreadCount);
667 #if !defined(_WIN32)
668     if ((err = pthread_mutex_unlock(&caller_cond_lock)))
669     {
670         log_error("Error %d from pthread_mutex_unlock. Unable to block for "
671                   "work to finish. ThreadPool_Init failed.\n",
672                   err);
673         return;
674     }
675 #endif // !_WIN32
676 
677     threadPoolInitErr = CL_SUCCESS;
678 }
679 
#if defined(_MSC_VER)
// Adapter giving ThreadPool_Init the callback signature expected by
// _InitOnceExecuteOnce. None of the arguments are used; always reports TRUE.
static BOOL CALLBACK _ThreadPool_Init(_PINIT_ONCE InitOnce, PVOID Parameter,
                                      PVOID *lpContex)
{
    (void)InitOnce;
    (void)Parameter;
    (void)lpContex;

    ThreadPool_Init();
    return TRUE;
}
#endif
688 
ThreadPool_Exit(void)689 void ThreadPool_Exit(void)
690 {
691     int err, count;
692     gRunCount = CL_INT_MAX;
693 
694 #if defined(__GNUC__)
695     // GCC extension:
696     // http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins
697     __sync_synchronize();
698 #elif defined(_MSC_VER)
699     _mm_mfence();
700 #else
701 #warning If this is a weakly ordered memory system, please add a memory barrier here to force this and everything else to memory before we proceed
702 #endif
703 
704     // spin waiting for threads to die
705     for (count = 0; 0 != gThreadCount && count < 1000; count++)
706     {
707 #if defined(_WIN32)
708         _WakeAllConditionVariable(cond_var);
709         Sleep(1);
710 #else // !_WIN32
711         if ((err = pthread_cond_broadcast(&cond_var)))
712         {
713             log_error("Error %d from pthread_cond_broadcast. Unable to wake up "
714                       "work threads. ThreadPool_Exit failed.\n",
715                       err);
716             break;
717         }
718         usleep(1000);
719 #endif // !_WIN32
720     }
721 
722     if (gThreadCount)
723         log_error("Error: Thread pool timed out after 1 second with %d threads "
724                   "still active.\n",
725                   gThreadCount);
726     else
727         log_info("Thread pool exited in a orderly fashion.\n");
728 }
729 
730 
// Blocking API that farms out count jobs to a thread pool.
// It may return with some work undone if func_ptr() returns a non-zero
// result.
//
// This function obviously has its shortcomings. Only one call to ThreadPool_Do
// can be running at a time. It is not intended for general purpose use.
// If clEnqueueNativeKernelFn, out of order queues and a CL_DEVICE_TYPE_CPU were
// all available then it would make more sense to use those features.
ThreadPool_Do(TPFuncPtr func_ptr,cl_uint count,void * userInfo)739 cl_int ThreadPool_Do(TPFuncPtr func_ptr, cl_uint count, void *userInfo)
740 {
741     cl_int newErr;
742     cl_int err = 0;
743     // Lazily set up our threads
744 #if defined(_MSC_VER) && (_WIN32_WINNT >= 0x600)
745     err = !_InitOnceExecuteOnce(&threadpool_init_control, _ThreadPool_Init,
746                                 NULL, NULL);
747 #elif defined(_WIN32)
748     if (threadpool_init_control == 0)
749     {
750 #warning This is buggy and race prone.  Find a better way.
751         ThreadPool_Init();
752         threadpool_init_control = 1;
753     }
754 #else // posix platform
755     err = pthread_once(&threadpool_init_control, ThreadPool_Init);
756     if (err)
757     {
758         log_error("Error %d from pthread_once. Unable to init threads. "
759                   "ThreadPool_Do failed.\n",
760                   err);
761         return err;
762     }
763 #endif
764     // Single threaded code to handle case where threadpool wasn't allocated or
765     // was disabled by environment variable
766     if (threadPoolInitErr)
767     {
768         cl_uint currentJob = 0;
769         cl_int result = CL_SUCCESS;
770 
771 #if defined(__APPLE__) && defined(__arm__)
772         // On most platforms which support denorm, default is FTZ off. However,
773         // on some hardware where the reference is computed, default might be
774         // flush denorms to zero e.g. arm. This creates issues in result
775         // verification. Since spec allows the implementation to either flush or
776         // not flush denorms to zero, an implementation may choose not be flush
777         // i.e. return denorm result whereas reference result may be zero
778         // (flushed denorm). Hence we need to disable denorm flushing on host
779         // side where reference is being computed to make sure we get
780         // non-flushed reference result. If implementation returns flushed
781         // result, we correctly take care of that in verification code.
782         FPU_mode_type oldMode;
783         DisableFTZ(&oldMode);
784 #endif
785         for (currentJob = 0; currentJob < count; currentJob++)
786             if ((result = func_ptr(currentJob, 0, userInfo)))
787             {
788 #if defined(__APPLE__) && defined(__arm__)
789                 // Restore FP state before leaving
790                 RestoreFPState(&oldMode);
791 #endif
792                 return result;
793             }
794 
795 #if defined(__APPLE__) && defined(__arm__)
796         // Restore FP state before leaving
797         RestoreFPState(&oldMode);
798 #endif
799 
800         return CL_SUCCESS;
801     }
802 
803     if (count >= MAX_COUNT)
804     {
805         log_error(
806             "Error: ThreadPool_Do count %d >= max threadpool count of %d\n",
807             count, MAX_COUNT);
808         return -1;
809     }
810 
811     // Enter critical region
812 #if defined(_WIN32)
813     EnterCriticalSection(gThreadPoolLock);
814 #else // !_WIN32
815     if ((err = pthread_mutex_lock(&gThreadPoolLock)))
816     {
817         switch (err)
818         {
819             case EDEADLK:
820                 log_error(
821                     "Error EDEADLK returned in ThreadPool_Do(). ThreadPool_Do "
822                     "is not designed to work recursively!\n");
823                 break;
824             case EINVAL:
825                 log_error("Error EINVAL returned in ThreadPool_Do(). How did "
826                           "we end up with an invalid gThreadPoolLock?\n");
827                 break;
828             default: break;
829         }
830         return err;
831     }
832 #endif // !_WIN32
833 
834     // Start modifying the job state observable by worker threads
835 #if defined(_WIN32)
836     EnterCriticalSection(cond_lock);
837 #else // !_WIN32
838     if ((err = pthread_mutex_lock(&cond_lock)))
839     {
840         log_error("Error %d from pthread_mutex_lock. Unable to wake up work "
841                   "threads. ThreadPool_Do failed.\n",
842                   err);
843         goto exit;
844     }
845 #endif // !_WIN32
846 
847     // Make sure the last thread done in the work pool doesn't signal us to wake
848     // before we get to the point where we are supposed to wait
849     //  That would cause a deadlock.
850 #if !defined(_WIN32)
851     if ((err = pthread_mutex_lock(&caller_cond_lock)))
852     {
853         log_error("Error %d from pthread_mutex_lock. Unable to block for work "
854                   "to finish. ThreadPool_Do failed.\n",
855                   err);
856         goto exit;
857     }
858 #endif // !_WIN32
859 
860     // Prime the worker threads to get going
861     jobError = CL_SUCCESS;
862     gRunCount = gJobCount = count;
863     gFunc_ptr = func_ptr;
864     gUserInfo = userInfo;
865 
866 #if defined(_WIN32)
867     ResetEvent(caller_event);
868     _WakeAllConditionVariable(cond_var);
869     LeaveCriticalSection(cond_lock);
870 #else // !_WIN32
871     if ((err = pthread_cond_broadcast(&cond_var)))
872     {
873         log_error("Error %d from pthread_cond_broadcast. Unable to wake up "
874                   "work threads. ThreadPool_Do failed.\n",
875                   err);
876         goto exit;
877     }
878     if ((err = pthread_mutex_unlock(&cond_lock)))
879     {
880         log_error("Error %d from pthread_mutex_unlock. Unable to wake up work "
881                   "threads. ThreadPool_Do failed.\n",
882                   err);
883         goto exit;
884     }
885 #endif // !_WIN32
886 
887     // block until they are done.  It would be slightly more efficient to do
888     // some of the work here though.
889     do
890     {
891 #if defined(_WIN32)
892         WaitForSingleObject(caller_event, INFINITE);
893 #else // !_WIN32
894         if ((err = pthread_cond_wait(&caller_cond_var, &caller_cond_lock)))
895         {
896             log_error("Error %d from pthread_cond_wait. Unable to block for "
897                       "work to finish. ThreadPool_Do failed.\n",
898                       err);
899             pthread_mutex_unlock(&caller_cond_lock);
900             goto exit;
901         }
902 #endif // !_WIN32
903     } while (gRunning);
904 #if !defined(_WIN32)
905     if ((err = pthread_mutex_unlock(&caller_cond_lock)))
906     {
907         log_error("Error %d from pthread_mutex_unlock. Unable to block for "
908                   "work to finish. ThreadPool_Do failed.\n",
909                   err);
910         goto exit;
911     }
912 #endif // !_WIN32
913 
914     err = jobError;
915 
916 exit:
917     // exit critical region
918 #if defined(_WIN32)
919     LeaveCriticalSection(gThreadPoolLock);
920 #else // !_WIN32
921     newErr = pthread_mutex_unlock(&gThreadPoolLock);
922     if (newErr)
923     {
924         log_error("Error %d from pthread_mutex_unlock. Unable to exit critical "
925                   "region. ThreadPool_Do failed.\n",
926                   newErr);
927         return err;
928     }
929 #endif // !_WIN32
930 
931     return err;
932 }
933 
GetThreadCount(void)934 cl_uint GetThreadCount(void)
935 {
936     // Lazily set up our threads
937 #if defined(_MSC_VER) && (_WIN32_WINNT >= 0x600)
938     cl_int err = !_InitOnceExecuteOnce(&threadpool_init_control,
939                                        _ThreadPool_Init, NULL, NULL);
940 #elif defined(_WIN32)
941     if (threadpool_init_control == 0)
942     {
943 #warning This is buggy and race prone.  Find a better way.
944         ThreadPool_Init();
945         threadpool_init_control = 1;
946     }
947 #else
948     cl_int err = pthread_once(&threadpool_init_control, ThreadPool_Init);
949     if (err)
950     {
951         log_error("Error %d from pthread_once. Unable to init threads. "
952                   "ThreadPool_Do failed.\n",
953                   err);
954         return err;
955     }
956 #endif // !_WIN32
957 
958     if (gThreadCount < 1) return 1;
959 
960     return gThreadCount;
961 }
962 
963 #else
964 
965 #ifndef MY_OS_REALLY_REALLY_DOESNT_SUPPORT_THREADS
966 #error ThreadPool implementation has not been multithreaded for this operating system. You must multithread this section.
967 #endif
968 //
969 // We require multithreading in parts of the test as a means of simultaneously
970 // testing reentrancy requirements of OpenCL API, while also checking
971 //
972 // A sample single threaded implementation follows, for documentation /
973 // bootstrapping purposes. It is not okay to use this for conformance testing!!!
974 //
975 // Exception:  If your operating system does not support multithreaded execution
976 // of any kind, then you may use this code.
977 //
978 
// Single-threaded stand-in for the atomic add: stores (*a + b) into *a and
// returns the value *a held before the update.
cl_int ThreadPool_AtomicAdd(volatile cl_int *a, cl_int b)
{
    // This fallback path is not multithreaded, so a plain read-modify-write
    // is used. If your operating system supports memory-barrier-atomics, use
    // those here instead.
    const cl_uint previous = *a;
    const cl_uint updated = previous + b;

    *a = updated;

    return previous;
}
990 
991 // Blocking API that farms out count jobs to a thread pool.
992 // It may return with some work undone if func_ptr() returns a non-zero
993 // result.
ThreadPool_Do(TPFuncPtr func_ptr,cl_uint count,void * userInfo)994 cl_int ThreadPool_Do(TPFuncPtr func_ptr, cl_uint count, void *userInfo)
995 {
996     cl_uint currentJob = 0;
997     cl_int result = CL_SUCCESS;
998 
999 #ifndef MY_OS_REALLY_REALLY_DOESNT_SUPPORT_THREADS
1000     // THIS FUNCTION IS NOT INTENDED FOR USE!!
1001     log_error("ERROR:  Test must be multithreaded!\n");
1002     exit(-1);
1003 #else
1004     static int spewCount = 0;
1005 
1006     if (0 == spewCount)
1007     {
1008         log_info("\nWARNING:  The operating system is claimed not to support "
1009                  "threads of any sort. Running single threaded.\n");
1010         spewCount = 1;
1011     }
1012 #endif
1013 
1014     // The multithreaded code should mimic this behavior:
1015     for (currentJob = 0; currentJob < count; currentJob++)
1016         if ((result = func_ptr(currentJob, 0, userInfo))) return result;
1017 
1018     return CL_SUCCESS;
1019 }
1020 
GetThreadCount(void)1021 cl_uint GetThreadCount(void) { return 1; }
1022 
// The single-threaded fallback cannot honor a thread-count request. A request
// for more than one thread is logged and otherwise ignored; smaller values
// are silently accepted.
void SetThreadCount(int count)
{
    if (count <= 1)
    {
        return;
    }

    log_info("WARNING: SetThreadCount(%d) ignored\n", count);
}
1027 
1028 #endif
1029