1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "ThreadPool.h"
17 #include "errorHelpers.h"
18 #include "fpcontrol.h"
19 #include <stdio.h>
20 #include <stdlib.h>
21
22 #if defined(__APPLE__) || defined(__linux__) || defined(_WIN32)
23 // or any other POSIX system
24
25 #if defined(_WIN32)
26 #include <windows.h>
27 #if defined(_MSC_VER)
28 #include <intrin.h>
29 #endif
30 #include "mingw_compat.h"
31 #include <process.h>
32 #else // !_WIN32
33 #include <pthread.h>
34 #include <unistd.h>
35 #include <sys/errno.h>
36 #ifdef __linux__
37 #include <sched.h>
38 #endif
39 #endif // !_WIN32
40
41 // declarations
42 #ifdef _WIN32
43 void ThreadPool_WorkerFunc(void *p);
44 #else
45 void *ThreadPool_WorkerFunc(void *p);
46 #endif
47 void ThreadPool_Init(void);
48 void ThreadPool_Exit(void);
49
50 #if defined(__MINGW32__)
51 // Mutex for implementing super heavy atomic operations if you don't have GCC or
52 // MSVC
53 CRITICAL_SECTION gAtomicLock;
54 #elif defined(__GNUC__) || defined(_MSC_VER)
55 #else
56 pthread_mutex_t gAtomicLock;
57 #endif
58
59 // Atomic add operator with mem barrier. Mem barrier needed to protect state
60 // modified by the worker functions.
cl_int ThreadPool_AtomicAdd(volatile cl_int *a, cl_int b)
{
    // Atomically performs *a += b and returns the PREVIOUS value of *a
    // (fetch-and-add semantics). The implementation is chosen per toolchain;
    // the mutex-based fallbacks serialize through gAtomicLock.
#if defined(__MINGW32__)
    // No atomics on Mingw32
    EnterCriticalSection(&gAtomicLock);
    cl_int old = *a;
    *a = old + b;
    LeaveCriticalSection(&gAtomicLock);
    return old;
#elif defined(__GNUC__)
    // GCC extension:
    // http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins
    return __sync_fetch_and_add(a, b);
// do we need __sync_synchronize() here, too? GCC docs are unclear whether
// __sync_fetch_and_add does a synchronize
#elif defined(_MSC_VER)
    return (cl_int)_InterlockedExchangeAdd((volatile LONG *)a, (LONG)b);
#else
#warning Please add a atomic add implementation here, with memory barrier. Fallback code is slow.
    // Best-effort fallback: on lock failure we log and proceed non-atomically
    // rather than aborting the test run.
    if (pthread_mutex_lock(&gAtomicLock))
        log_error("Atomic operation failed. pthread_mutex_lock(&gAtomicLock) "
                  "returned an error\n");
    cl_int old = *a;
    *a = old + b;
    if (pthread_mutex_unlock(&gAtomicLock))
        log_error("Failed to release gAtomicLock. Further atomic operations "
                  "may deadlock!\n");
    return old;
#endif
}
91
92 #if defined(_WIN32)
93 // Uncomment the following line if Windows XP support is not required.
94 // #define HAS_INIT_ONCE_EXECUTE_ONCE 1
95
96 #if defined(HAS_INIT_ONCE_EXECUTE_ONCE)
97 #define _INIT_ONCE INIT_ONCE
98 #define _PINIT_ONCE PINIT_ONCE
99 #define _InitOnceExecuteOnce InitOnceExecuteOnce
100 #else // !HAS_INIT_ONCE_EXECUTE_ONCE
101
102 typedef volatile LONG _INIT_ONCE;
103 typedef _INIT_ONCE *_PINIT_ONCE;
104 typedef BOOL(CALLBACK *_PINIT_ONCE_FN)(_PINIT_ONCE, PVOID, PVOID *);
105
106 #define _INIT_ONCE_UNINITIALIZED 0
107 #define _INIT_ONCE_IN_PROGRESS 1
108 #define _INIT_ONCE_DONE 2
109
// Emulation of Vista+ InitOnceExecuteOnce for pre-Vista Windows. Exactly one
// caller wins the interlocked UNINITIALIZED -> IN_PROGRESS transition and
// runs InitFn; every other caller spins (sleeping 1 ms per iteration) until
// the winner stores _INIT_ONCE_DONE. Always returns TRUE.
// NOTE(review): InitFn's return value is ignored here, unlike the real
// InitOnceExecuteOnce, which propagates a FALSE return -- confirm callers
// never rely on that.
static BOOL _InitOnceExecuteOnce(_PINIT_ONCE InitOnce, _PINIT_ONCE_FN InitFn,
                                 PVOID Parameter, LPVOID *Context)
{
    while (*InitOnce != _INIT_ONCE_DONE)
    {
        // The compare-exchange guarantees only one thread claims the
        // in-progress slot; the cheap pre-check avoids hammering the bus.
        if (*InitOnce != _INIT_ONCE_IN_PROGRESS
            && _InterlockedCompareExchange(InitOnce, _INIT_ONCE_IN_PROGRESS,
                                           _INIT_ONCE_UNINITIALIZED)
                == _INIT_ONCE_UNINITIALIZED)
        {
            InitFn(InitOnce, Parameter, Context);
            *InitOnce = _INIT_ONCE_DONE;
            return TRUE;
        }
        Sleep(1);
    }
    return TRUE;
}
128 #endif // !HAS_INIT_ONCE_EXECUTE_ONCE
129
130 // Uncomment the following line if Windows XP support is not required.
131 // #define HAS_CONDITION_VARIABLE 1
132
133 #if defined(HAS_CONDITION_VARIABLE)
134 #define _CONDITION_VARIABLE CONDITION_VARIABLE
135 #define _InitializeConditionVariable InitializeConditionVariable
136 #define _SleepConditionVariableCS SleepConditionVariableCS
137 #define _WakeAllConditionVariable WakeAllConditionVariable
138 #else // !HAS_CONDITION_VARIABLE
// Emulated condition variable for pre-Vista Windows, built from a
// manual-reset event plus a private critical section. A generation counter
// distinguishes waiters present at broadcast time from late arrivals.
typedef struct
{
    HANDLE mEvent; // Used to park the thread.
    // Used to protect mWaiters, mGeneration and mReleaseCount:
    CRITICAL_SECTION mLock[1];
    volatile cl_int mWaiters; // Number of threads waiting on this cond var.
    volatile cl_int mGeneration; // Wait generation count.
    volatile cl_int mReleaseCount; // Number of releases to execute before
                                   // resetting the event.
} _CONDITION_VARIABLE;

typedef _CONDITION_VARIABLE *_PCONDITION_VARIABLE;
151
// Initialize an emulated condition variable. Must run before any
// _SleepConditionVariableCS / _WakeAllConditionVariable on cond_var.
static void _InitializeConditionVariable(_PCONDITION_VARIABLE cond_var)
{
    // Manual-reset event, initially unsignaled: once signaled, every
    // released waiter observes it until the last one resets it.
    cond_var->mEvent = CreateEvent(NULL, TRUE, FALSE, NULL);
    InitializeCriticalSection(cond_var->mLock);
    cond_var->mWaiters = 0;
    cond_var->mGeneration = 0;
    // Always initialize mReleaseCount. Previously this assignment was
    // guarded by #if !defined(NDEBUG), leaving the field uninitialized in
    // release builds even though _SleepConditionVariableCS reads and
    // decrements it unconditionally -- a non-zero garbage value could
    // spuriously release waiters or corrupt the reset bookkeeping.
    cond_var->mReleaseCount = 0;
}
162
// Emulated SleepConditionVariableCS: atomically releases cond_lock and
// blocks on cond_var, re-acquiring cond_lock before returning. The
// "ignored" timeout parameter is not honored -- this always waits forever.
static void _SleepConditionVariableCS(_PCONDITION_VARIABLE cond_var,
                                      PCRITICAL_SECTION cond_lock,
                                      DWORD ignored)
{
    // Record the generation we arrived in and register as a waiter, then
    // drop the caller's lock so wakers can make progress.
    EnterCriticalSection(cond_var->mLock);
    cl_int generation = cond_var->mGeneration;
    ++cond_var->mWaiters;
    LeaveCriticalSection(cond_var->mLock);
    LeaveCriticalSection(cond_lock);

    while (TRUE)
    {
        WaitForSingleObject(cond_var->mEvent, INFINITE);
        // Only leave the wait when a broadcast from a NEWER generation still
        // has release slots left; otherwise this wake-up was stale and we
        // park again on the event.
        EnterCriticalSection(cond_var->mLock);
        BOOL done =
            cond_var->mReleaseCount > 0 && cond_var->mGeneration != generation;
        LeaveCriticalSection(cond_var->mLock);
        if (done)
        {
            break;
        }
    }

    // Re-acquire the caller's lock first (matching real condition-variable
    // semantics), then consume one release slot; the last released waiter
    // resets the manual-reset event.
    EnterCriticalSection(cond_lock);
    EnterCriticalSection(cond_var->mLock);
    if (--cond_var->mReleaseCount == 0)
    {
        ResetEvent(cond_var->mEvent);
    }
    --cond_var->mWaiters;
    LeaveCriticalSection(cond_var->mLock);
}
195
// Emulated WakeAllConditionVariable: releases every thread currently parked
// in _SleepConditionVariableCS. Threads that begin waiting after this call
// belong to a newer generation and are not released by this broadcast.
static void _WakeAllConditionVariable(_PCONDITION_VARIABLE cond_var)
{
    EnterCriticalSection(cond_var->mLock);
    if (cond_var->mWaiters > 0)
    {
        // Bump the generation so current waiters can tell this broadcast
        // apart from any earlier one, hand each waiter one release slot,
        // and signal the manual-reset event to wake them all.
        ++cond_var->mGeneration;
        cond_var->mReleaseCount = cond_var->mWaiters;
        SetEvent(cond_var->mEvent);
    }
    LeaveCriticalSection(cond_var->mLock);
}
207 #endif // !HAS_CONDITION_VARIABLE
208 #endif // _WIN32
209
210 #define MAX_COUNT (1 << 29)
211
212 // Global state to coordinate whether the threads have been launched
213 // successfully or not
214 #if defined(_MSC_VER) && (_WIN32_WINNT >= 0x600)
215 static _INIT_ONCE threadpool_init_control;
216 #elif defined(_WIN32) // MingW of XP
217 static int threadpool_init_control;
218 #else // Posix platforms
219 pthread_once_t threadpool_init_control = PTHREAD_ONCE_INIT;
220 #endif
221 cl_int threadPoolInitErr = -1; // set to CL_SUCCESS on successful thread launch
222
223 // critical region lock around ThreadPool_Do. We can only run one ThreadPool_Do
224 // at a time, because we are too lazy to set up a queue here, and don't expect
225 // to need one.
226 #if defined(_WIN32)
227 CRITICAL_SECTION gThreadPoolLock[1];
228 #else // !_WIN32
229 pthread_mutex_t gThreadPoolLock;
230 #endif // !_WIN32
231
232 // Condition variable to park ThreadPool threads when not working
233 #if defined(_WIN32)
234 CRITICAL_SECTION cond_lock[1];
235 _CONDITION_VARIABLE cond_var[1];
236 #else // !_WIN32
237 pthread_mutex_t cond_lock;
238 pthread_cond_t cond_var;
239 #endif // !_WIN32
240
241 // Condition variable state. How many iterations on the function left to run,
242 // set to CL_INT_MAX to cause worker threads to exit. Note: this value might
243 // go negative.
244 volatile cl_int gRunCount = 0;
245
246 // State that only changes when the threadpool is not working.
247 volatile TPFuncPtr gFunc_ptr = NULL;
248 volatile void *gUserInfo = NULL;
249 volatile cl_int gJobCount = 0;
250
251 // State that may change while the thread pool is working
252 volatile cl_int jobError = CL_SUCCESS; // err code return for the job as a whole
253
254 // Condition variable to park caller while waiting
255 #if defined(_WIN32)
256 HANDLE caller_event;
257 #else // !_WIN32
258 pthread_mutex_t caller_cond_lock;
259 pthread_cond_t caller_cond_var;
260 #endif // !_WIN32
261
262 // # of threads intended to be running. Running threads will decrement this
263 // as they discover they've run out of work to do.
264 volatile cl_int gRunning = 0;
265
266 // The total number of threads launched.
267 volatile cl_int gThreadCount = 0;
#ifdef _WIN32
void ThreadPool_WorkerFunc(void *p)
#else
void *ThreadPool_WorkerFunc(void *p)
#endif
{
    // Worker thread main loop. p points to a shared counter from which each
    // worker claims a unique threadID. Job items are handed out by
    // atomically decrementing gRunCount: a positive result is a valid item,
    // <= 0 means "no work available", and >= MAX_COUNT means the pool is
    // shutting down (ThreadPool_Exit sets gRunCount to CL_INT_MAX).
    cl_uint threadID = ThreadPool_AtomicAdd((volatile cl_int *)p, 1);
    cl_int item = ThreadPool_AtomicAdd(&gRunCount, -1);
    // log_info( "ThreadPool_WorkerFunc start: gRunning = %d\n", gRunning );

    while (MAX_COUNT > item)
    {
        cl_int err;

        // check for more work to do
        if (0 >= item)
        {
            // log_info("Thread %d has run out of work.\n", threadID);

            // No work to do. Attempt to block waiting for work
#if defined(_WIN32)
            EnterCriticalSection(cond_lock);
#else // !_WIN32
            if ((err = pthread_mutex_lock(&cond_lock)))
            {
                log_error(
                    "Error %d from pthread_mutex_lock. Worker %d unable to "
                    "block waiting for work. ThreadPool_WorkerFunc failed.\n",
                    err, threadID);
                goto exit;
            }
#endif // !_WIN32

            // Leave the set of running threads; ThreadPool_Do/Init block
            // until gRunning reaches zero.
            cl_int remaining = ThreadPool_AtomicAdd(&gRunning, -1);
            // log_info("ThreadPool_WorkerFunc: gRunning = %d\n",
            //          remaining - 1);
            if (1 == remaining)
            { // last thread out signal the main thread to wake up
#if defined(_WIN32)
                SetEvent(caller_event);
#else // !_WIN32
                if ((err = pthread_mutex_lock(&caller_cond_lock)))
                {
                    log_error("Error %d from pthread_mutex_lock. Unable to "
                              "wake caller.\n",
                              err);
                    goto exit;
                }
                if ((err = pthread_cond_broadcast(&caller_cond_var)))
                {
                    log_error(
                        "Error %d from pthread_cond_broadcast. Unable to wake "
                        "up main thread. ThreadPool_WorkerFunc failed.\n",
                        err);
                    goto exit;
                }
                if ((err = pthread_mutex_unlock(&caller_cond_lock)))
                {
                    log_error("Error %d from pthread_mutex_lock. Unable to "
                              "wake caller.\n",
                              err);
                    goto exit;
                }
#endif // !_WIN32
            }

            // loop in case we are woken only to discover that some other thread
            // already did all the work
            while (0 >= item)
            {
#if defined(_WIN32)
                _SleepConditionVariableCS(cond_var, cond_lock, INFINITE);
#else // !_WIN32
                if ((err = pthread_cond_wait(&cond_var, &cond_lock)))
                {
                    log_error(
                        "Error %d from pthread_cond_wait. Unable to block for "
                        "waiting for work. ThreadPool_WorkerFunc failed.\n",
                        err);
                    pthread_mutex_unlock(&cond_lock);
                    goto exit;
                }
#endif // !_WIN32

                // try again to get a valid item id
                item = ThreadPool_AtomicAdd(&gRunCount, -1);
                if (MAX_COUNT <= item) // exit if we are done
                {
#if defined(_WIN32)
                    LeaveCriticalSection(cond_lock);
#else // !_WIN32
                    pthread_mutex_unlock(&cond_lock);
#endif // !_WIN32
                    goto exit;
                }
            }

            // Rejoin the set of running threads before releasing cond_lock,
            // so the caller cannot observe a spurious "all done" state.
            ThreadPool_AtomicAdd(&gRunning, 1);
            // log_info("Thread %d has found work.\n", threadID);

#if defined(_WIN32)
            LeaveCriticalSection(cond_lock);
#else // !_WIN32
            if ((err = pthread_mutex_unlock(&cond_lock)))
            {
                log_error(
                    "Error %d from pthread_mutex_unlock. Unable to block for "
                    "waiting for work. ThreadPool_WorkerFunc failed.\n",
                    err);
                goto exit;
            }
#endif // !_WIN32
        }

        // we have a valid item, so do the work
        // but only if we haven't already encountered an error
        if (CL_SUCCESS == jobError)
        {
            // log_info("Thread %d doing job %d\n", threadID, item - 1);

#if defined(__APPLE__) && defined(__arm__)
            // On most platforms which support denorm, default is FTZ off.
            // However, on some hardware where the reference is computed,
            // default might be flush denorms to zero e.g. arm. This creates
            // issues in result verification. Since spec allows the
            // implementation to either flush or not flush denorms to zero, an
            // implementation may choose not be flush i.e. return denorm result
            // whereas reference result may be zero (flushed denorm). Hence we
            // need to disable denorm flushing on host side where reference is
            // being computed to make sure we get non-flushed reference result.
            // If implementation returns flushed result, we correctly take care
            // of that in verification code.
            FPU_mode_type oldMode;
            DisableFTZ(&oldMode);
#endif

            // Call the user's function with this item ID
            err = gFunc_ptr(item - 1, threadID, (void *)gUserInfo);
#if defined(__APPLE__) && defined(__arm__)
            // Restore FP state
            RestoreFPState(&oldMode);
#endif

            if (err)
            {
                // The job failed: record the FIRST error observed and zero
                // gRunCount so other workers stop picking up new items.
#if (__MINGW32__)
                EnterCriticalSection(&gAtomicLock);
                if (jobError == CL_SUCCESS) jobError = err;
                gRunCount = 0;
                LeaveCriticalSection(&gAtomicLock);
#elif defined(__GNUC__)
                // GCC extension:
                // http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins
                // set the new error if we are the first one there.
                __sync_val_compare_and_swap(&jobError, CL_SUCCESS, err);

                // drop run count to 0
                gRunCount = 0;
                __sync_synchronize();
#elif defined(_MSC_VER)
                // set the new error if we are the first one there.
                _InterlockedCompareExchange((volatile LONG *)&jobError, err,
                                            CL_SUCCESS);

                // drop run count to 0
                gRunCount = 0;
                _mm_mfence();
#else
                if (pthread_mutex_lock(&gAtomicLock))
                    log_error(
                        "Atomic operation failed. "
                        "pthread_mutex_lock(&gAtomicLock) returned an error\n");
                if (jobError == CL_SUCCESS) jobError = err;
                gRunCount = 0;
                if (pthread_mutex_unlock(&gAtomicLock))
                    log_error("Failed to release gAtomicLock. Further atomic "
                              "operations may deadlock\n");
#endif
            }
        }

        // get the next item
        item = ThreadPool_AtomicAdd(&gRunCount, -1);
    }

exit:
    log_info("ThreadPool: thread %d exiting.\n", threadID);
    ThreadPool_AtomicAdd(&gThreadCount, -1);
#if !defined(_WIN32)
    return NULL;
#endif
}
460
// SetThreadCount() may be used to artificially set the number of worker threads
462 // If the value is 0 (the default) the number of threads will be determined
463 // based on the number of CPU cores. If it is a unicore machine, then 2 will be
464 // used, so that we still get some testing for thread safety.
465 //
466 // If count < 2 or the CL_TEST_SINGLE_THREADED environment variable is set then
467 // the code will run single threaded, but will report an error to indicate that
468 // the test is invalid. This option is intended for debugging purposes only. It
469 // is suggested as a convention that test apps set the thread count to 1 in
470 // response to the -m flag.
471 //
472 // SetThreadCount() must be called before the first call to GetThreadCount() or
// ThreadPool_Do(), otherwise the behavior is undefined.
SetThreadCount(int count)474 void SetThreadCount(int count)
475 {
476 if (threadPoolInitErr == CL_SUCCESS)
477 {
478 log_error("Error: It is illegal to set the thread count after the "
479 "first call to ThreadPool_Do or GetThreadCount\n");
480 abort();
481 }
482
483 gThreadCount = count;
484 }
485
// One-time pool setup: decides the thread count, initializes all locks and
// condition variables, launches the workers, and blocks until every worker
// has started and parked itself waiting for work. On any failure it falls
// back to single-threaded operation by setting gThreadCount to 1 and leaving
// threadPoolInitErr non-CL_SUCCESS.
void ThreadPool_Init(void)
{
    cl_int i;
    int err;
    // Shared launch counter: each worker atomically increments this to claim
    // its unique thread ID. Lives on this stack frame, which is safe because
    // this function does not return until all workers have started.
    volatile cl_uint threadID = 0;

    // Check for manual override of multithreading code. We add this for better
    // debuggability.
    if (getenv("CL_TEST_SINGLE_THREADED"))
    {
        log_error("ERROR: CL_TEST_SINGLE_THREADED is set in the environment. "
                  "Running single threaded.\n*** TEST IS INVALID! ***\n");
        gThreadCount = 1;
        return;
    }

    // Figure out how many threads to run -- check first for non-zero to give
    // the implementation the chance
    if (0 == gThreadCount)
    {
#if defined(_MSC_VER) || defined(__MINGW64__)
        // Count logical cores by summing the set bits of every
        // RelationProcessorCore entry reported by the OS.
        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL;
        DWORD length = 0;

        // First call fails but reports the required buffer size in `length`.
        GetLogicalProcessorInformation(NULL, &length);
        buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(length);
        if (buffer != NULL)
        {
            if (GetLogicalProcessorInformation(buffer, &length) == TRUE)
            {
                PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
                while (
                    ptr
                    < &buffer[length
                              / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION)])
                {
                    if (ptr->Relationship == RelationProcessorCore)
                    {
                        // Count the number of bits in ProcessorMask (number of
                        // logical cores)
                        ULONG mask = ptr->ProcessorMask;
                        while (mask)
                        {
                            ++gThreadCount;
                            mask &= mask - 1; // Remove 1 bit at a time
                        }
                    }
                    ++ptr;
                }
            }
            free(buffer);
        }
#elif defined(__MINGW32__)
        {
#warning How about this, instead of hard coding it to 2?
            SYSTEM_INFO sysinfo;
            GetSystemInfo(&sysinfo);
            gThreadCount = sysinfo.dwNumberOfProcessors;
        }
#elif defined(__linux__) && !defined(__ANDROID__)
        // Prefer the CPUs actually available to this process (affinity mask)
        // over the machine-wide count.
        cpu_set_t affinity;
        if (0 == sched_getaffinity(0, sizeof(cpu_set_t), &affinity))
        {
#if !(defined(CPU_COUNT))
            gThreadCount = 1;
#else
            gThreadCount = CPU_COUNT(&affinity);
#endif
        }
        else
        {
            // Hopefully your system returns logical cpus here, as does MacOS X
            gThreadCount = (cl_int)sysconf(_SC_NPROCESSORS_CONF);
        }
#else /* !_WIN32 */
        // Hopefully your system returns logical cpus here, as does MacOS X
        gThreadCount = (cl_int)sysconf(_SC_NPROCESSORS_CONF);
#endif // !_WIN32

        // Multithreaded tests are required to run multithreaded even on unicore
        // systems so as to test thread safety
        if (1 == gThreadCount) gThreadCount = 2;
    }

    // When working in 32 bit limit the thread number to 12
    // This fix was made due to memory issues in integer_ops test
    // When running integer_ops, the test opens as many threads as the
    // machine has and each thread allocates a fixed amount of memory
    // When running this test on dual socket machine in 32-bit, the
    // process memory is not sufficient and the test fails
#if defined(_WIN32) && !defined(_M_X64)
    if (gThreadCount > 12)
    {
        gThreadCount = 12;
    }
#endif

    // Allow the app to set thread count to <0 for debugging purposes.
    // This will cause the test to run single threaded.
    if (gThreadCount < 2)
    {
        log_error("ERROR: Running single threaded because thread count < 2. "
                  "\n*** TEST IS INVALID! ***\n");
        gThreadCount = 1;
        return;
    }

#if defined(_WIN32)
    InitializeCriticalSection(gThreadPoolLock);
    InitializeCriticalSection(cond_lock);
    _InitializeConditionVariable(cond_var);
    caller_event = CreateEvent(NULL, FALSE, FALSE, NULL);
#elif defined(__GNUC__)
    // Dont rely on PTHREAD_MUTEX_INITIALIZER for intialization of a mutex since
    // it might cause problem with some flavors of gcc compilers.
    pthread_cond_init(&cond_var, NULL);
    pthread_mutex_init(&cond_lock, NULL);
    pthread_cond_init(&caller_cond_var, NULL);
    pthread_mutex_init(&caller_cond_lock, NULL);
    pthread_mutex_init(&gThreadPoolLock, NULL);
#endif

#if !(defined(__GNUC__) || defined(_MSC_VER) || defined(__MINGW32__))
    // NOTE(review): pthread_mutex_initialize is not a standard pthreads API;
    // this looks like it should be pthread_mutex_init(&gAtomicLock, NULL).
    // This branch only compiles on toolchains that are neither GCC, MSVC,
    // nor MinGW -- confirm before relying on it.
    pthread_mutex_initialize(gAtomicLock);
#elif defined(__MINGW32__)
    InitializeCriticalSection(&gAtomicLock);
#endif
    // Make sure the last thread done in the work pool doesn't signal us to wake
    // before we get to the point where we are supposed to wait
    // That would cause a deadlock.
#if !defined(_WIN32)
    if ((err = pthread_mutex_lock(&caller_cond_lock)))
    {
        log_error("Error %d from pthread_mutex_lock. Unable to block for work "
                  "to finish. ThreadPool_Init failed.\n",
                  err);
        gThreadCount = 1;
        return;
    }
#endif // !_WIN32

    gRunning = gThreadCount;
    // init threads
    for (i = 0; i < gThreadCount; i++)
    {
#if defined(_WIN32)
        uintptr_t handle =
            _beginthread(ThreadPool_WorkerFunc, 0, (void *)&threadID);
        err = (handle == 0);
#else // !_WIN32
        pthread_t tid = 0;
        err = pthread_create(&tid, NULL, ThreadPool_WorkerFunc,
                             (void *)&threadID);
#endif // !_WIN32
        if (err)
        {
            // Record the failure and shrink the pool to the threads that
            // actually launched.
            log_error("Error %d launching thread %d\n", err, i);
            threadPoolInitErr = err;
            gThreadCount = i;
            break;
        }
    }

    atexit(ThreadPool_Exit);

    // block until they are done launching.
    // Each worker decrements gRunCount once on startup (there is no work
    // yet), so gRunCount reaching -gThreadCount means every worker has
    // started and parked itself.
    do
    {
#if defined(_WIN32)
        WaitForSingleObject(caller_event, INFINITE);
#else // !_WIN32
        if ((err = pthread_cond_wait(&caller_cond_var, &caller_cond_lock)))
        {
            log_error("Error %d from pthread_cond_wait. Unable to block for "
                      "work to finish. ThreadPool_Init failed.\n",
                      err);
            pthread_mutex_unlock(&caller_cond_lock);
            return;
        }
#endif // !_WIN32
    } while (gRunCount != -gThreadCount);
#if !defined(_WIN32)
    if ((err = pthread_mutex_unlock(&caller_cond_lock)))
    {
        log_error("Error %d from pthread_mutex_unlock. Unable to block for "
                  "work to finish. ThreadPool_Init failed.\n",
                  err);
        return;
    }
#endif // !_WIN32

    // Mark the pool usable; SetThreadCount refuses to run after this point.
    threadPoolInitErr = CL_SUCCESS;
}
679
#if defined(_MSC_VER)
// InitOnceExecuteOnce-compatible trampoline so ThreadPool_Init can be run
// exactly once via _InitOnceExecuteOnce on MSVC builds. Parameters are
// required by the callback signature but unused.
static BOOL CALLBACK _ThreadPool_Init(_PINIT_ONCE InitOnce, PVOID Parameter,
                                      PVOID *lpContex)
{
    ThreadPool_Init();
    return TRUE;
}
#endif
688
// Shut the pool down (registered with atexit by ThreadPool_Init). Sets
// gRunCount to CL_INT_MAX -- the worker loop treats any item >= MAX_COUNT as
// an exit signal -- then repeatedly broadcasts the work condition variable
// and sleeps 1 ms, giving the workers up to ~1 second to notice and exit.
void ThreadPool_Exit(void)
{
    int err, count;
    gRunCount = CL_INT_MAX;

#if defined(__GNUC__)
    // GCC extension:
    // http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins
    __sync_synchronize();
#elif defined(_MSC_VER)
    _mm_mfence();
#else
#warning If this is a weakly ordered memory system, please add a memory barrier here to force this and everything else to memory before we proceed
#endif

    // spin waiting for threads to die
    // Each exiting worker decrements gThreadCount, so zero means all gone.
    for (count = 0; 0 != gThreadCount && count < 1000; count++)
    {
#if defined(_WIN32)
        _WakeAllConditionVariable(cond_var);
        Sleep(1);
#else // !_WIN32
        if ((err = pthread_cond_broadcast(&cond_var)))
        {
            log_error("Error %d from pthread_cond_broadcast. Unable to wake up "
                      "work threads. ThreadPool_Exit failed.\n",
                      err);
            break;
        }
        usleep(1000);
#endif // !_WIN32
    }

    if (gThreadCount)
        log_error("Error: Thread pool timed out after 1 second with %d threads "
                  "still active.\n",
                  gThreadCount);
    else
        log_info("Thread pool exited in a orderly fashion.\n");
}
729
730
731 // Blocking API that farms out count jobs to a thread pool.
732 // It may return with some work undone if func_ptr() returns a non-zero
733 // result.
734 //
// This function obviously has its shortcomings. Only one call to ThreadPool_Do
736 // can be running at a time. It is not intended for general purpose use.
737 // If clEnqueueNativeKernelFn, out of order queues and a CL_DEVICE_TYPE_CPU were
738 // all available then it would make more sense to use those features.
// Farm `count` invocations of func_ptr out to the worker pool and block
// until they all complete (or one returns non-zero, which stops further
// items from being issued). Returns CL_SUCCESS, the first non-zero value
// returned by func_ptr, or a thread/locking error code. Only one
// ThreadPool_Do may run at a time (serialized by gThreadPoolLock); it falls
// back to running the jobs inline on this thread when the pool failed to
// initialize or was disabled.
cl_int ThreadPool_Do(TPFuncPtr func_ptr, cl_uint count, void *userInfo)
{
    cl_int newErr;
    cl_int err = 0;
    // Lazily set up our threads
#if defined(_MSC_VER) && (_WIN32_WINNT >= 0x600)
    err = !_InitOnceExecuteOnce(&threadpool_init_control, _ThreadPool_Init,
                                NULL, NULL);
#elif defined(_WIN32)
    if (threadpool_init_control == 0)
    {
#warning This is buggy and race prone. Find a better way.
        ThreadPool_Init();
        threadpool_init_control = 1;
    }
#else // posix platform
    err = pthread_once(&threadpool_init_control, ThreadPool_Init);
    if (err)
    {
        log_error("Error %d from pthread_once. Unable to init threads. "
                  "ThreadPool_Do failed.\n",
                  err);
        return err;
    }
#endif
    // Single threaded code to handle case where threadpool wasn't allocated or
    // was disabled by environment variable
    if (threadPoolInitErr)
    {
        cl_uint currentJob = 0;
        cl_int result = CL_SUCCESS;

#if defined(__APPLE__) && defined(__arm__)
        // On most platforms which support denorm, default is FTZ off. However,
        // on some hardware where the reference is computed, default might be
        // flush denorms to zero e.g. arm. This creates issues in result
        // verification. Since spec allows the implementation to either flush or
        // not flush denorms to zero, an implementation may choose not be flush
        // i.e. return denorm result whereas reference result may be zero
        // (flushed denorm). Hence we need to disable denorm flushing on host
        // side where reference is being computed to make sure we get
        // non-flushed reference result. If implementation returns flushed
        // result, we correctly take care of that in verification code.
        FPU_mode_type oldMode;
        DisableFTZ(&oldMode);
#endif
        // Run every job inline on this thread, stopping at the first failure.
        for (currentJob = 0; currentJob < count; currentJob++)
            if ((result = func_ptr(currentJob, 0, userInfo)))
            {
#if defined(__APPLE__) && defined(__arm__)
                // Restore FP state before leaving
                RestoreFPState(&oldMode);
#endif
                return result;
            }

#if defined(__APPLE__) && defined(__arm__)
        // Restore FP state before leaving
        RestoreFPState(&oldMode);
#endif

        return CL_SUCCESS;
    }

    // Item values >= MAX_COUNT are reserved as the worker-exit signal, so a
    // job count that large cannot be dispatched.
    if (count >= MAX_COUNT)
    {
        log_error(
            "Error: ThreadPool_Do count %d >= max threadpool count of %d\n",
            count, MAX_COUNT);
        return -1;
    }

    // Enter critical region
#if defined(_WIN32)
    EnterCriticalSection(gThreadPoolLock);
#else // !_WIN32
    if ((err = pthread_mutex_lock(&gThreadPoolLock)))
    {
        switch (err)
        {
            case EDEADLK:
                log_error(
                    "Error EDEADLK returned in ThreadPool_Do(). ThreadPool_Do "
                    "is not designed to work recursively!\n");
                break;
            case EINVAL:
                log_error("Error EINVAL returned in ThreadPool_Do(). How did "
                          "we end up with an invalid gThreadPoolLock?\n");
                break;
            default: break;
        }
        return err;
    }
#endif // !_WIN32

    // Start modifying the job state observable by worker threads
#if defined(_WIN32)
    EnterCriticalSection(cond_lock);
#else // !_WIN32
    if ((err = pthread_mutex_lock(&cond_lock)))
    {
        log_error("Error %d from pthread_mutex_lock. Unable to wake up work "
                  "threads. ThreadPool_Do failed.\n",
                  err);
        goto exit;
    }
#endif // !_WIN32

    // Make sure the last thread done in the work pool doesn't signal us to wake
    // before we get to the point where we are supposed to wait
    // That would cause a deadlock.
#if !defined(_WIN32)
    if ((err = pthread_mutex_lock(&caller_cond_lock)))
    {
        log_error("Error %d from pthread_mutex_lock. Unable to block for work "
                  "to finish. ThreadPool_Do failed.\n",
                  err);
        goto exit;
    }
#endif // !_WIN32

    // Prime the worker threads to get going
    jobError = CL_SUCCESS;
    gRunCount = gJobCount = count;
    gFunc_ptr = func_ptr;
    gUserInfo = userInfo;

#if defined(_WIN32)
    ResetEvent(caller_event);
    _WakeAllConditionVariable(cond_var);
    LeaveCriticalSection(cond_lock);
#else // !_WIN32
    if ((err = pthread_cond_broadcast(&cond_var)))
    {
        log_error("Error %d from pthread_cond_broadcast. Unable to wake up "
                  "work threads. ThreadPool_Do failed.\n",
                  err);
        goto exit;
    }
    if ((err = pthread_mutex_unlock(&cond_lock)))
    {
        log_error("Error %d from pthread_mutex_unlock. Unable to wake up work "
                  "threads. ThreadPool_Do failed.\n",
                  err);
        goto exit;
    }
#endif // !_WIN32

    // block until they are done. It would be slightly more efficient to do
    // some of the work here though.
    // The last worker to run out of items drops gRunning to zero and signals
    // us; the loop also guards against spurious wakeups.
    do
    {
#if defined(_WIN32)
        WaitForSingleObject(caller_event, INFINITE);
#else // !_WIN32
        if ((err = pthread_cond_wait(&caller_cond_var, &caller_cond_lock)))
        {
            log_error("Error %d from pthread_cond_wait. Unable to block for "
                      "work to finish. ThreadPool_Do failed.\n",
                      err);
            pthread_mutex_unlock(&caller_cond_lock);
            goto exit;
        }
#endif // !_WIN32
    } while (gRunning);
#if !defined(_WIN32)
    if ((err = pthread_mutex_unlock(&caller_cond_lock)))
    {
        log_error("Error %d from pthread_mutex_unlock. Unable to block for "
                  "work to finish. ThreadPool_Do failed.\n",
                  err);
        goto exit;
    }
#endif // !_WIN32

    // Report the first error (if any) that a job returned.
    err = jobError;

exit:
    // exit critical region
#if defined(_WIN32)
    LeaveCriticalSection(gThreadPoolLock);
#else // !_WIN32
    newErr = pthread_mutex_unlock(&gThreadPoolLock);
    if (newErr)
    {
        log_error("Error %d from pthread_mutex_unlock. Unable to exit critical "
                  "region. ThreadPool_Do failed.\n",
                  newErr);
        return err;
    }
#endif // !_WIN32

    return err;
}
933
// Return the number of worker threads in the pool, lazily initializing the
// pool on first use. Always reports at least 1 (the caller itself counts as
// a worker in single-threaded fallback mode).
cl_uint GetThreadCount(void)
{
    // Lazily set up our threads
#if defined(_MSC_VER) && (_WIN32_WINNT >= 0x600)
    cl_int err = !_InitOnceExecuteOnce(&threadpool_init_control,
                                       _ThreadPool_Init, NULL, NULL);
#elif defined(_WIN32)
    if (threadpool_init_control == 0)
    {
#warning This is buggy and race prone. Find a better way.
        ThreadPool_Init();
        threadpool_init_control = 1;
    }
#else
    cl_int err = pthread_once(&threadpool_init_control, ThreadPool_Init);
    if (err)
    {
        // NOTE(review): a pthread error code is returned through the cl_uint
        // return type here, indistinguishable from a thread count -- confirm
        // callers can tolerate this.
        log_error("Error %d from pthread_once. Unable to init threads. "
                  "ThreadPool_Do failed.\n",
                  err);
        return err;
    }
#endif // !_WIN32

    if (gThreadCount < 1) return 1;

    return gThreadCount;
}
962
963 #else
964
965 #ifndef MY_OS_REALLY_REALLY_DOESNT_SUPPORT_THREADS
966 #error ThreadPool implementation has not been multithreaded for this operating system. You must multithread this section.
967 #endif
968 //
969 // We require multithreading in parts of the test as a means of simultaneously
970 // testing reentrancy requirements of OpenCL API, while also checking
971 //
972 // A sample single threaded implementation follows, for documentation /
973 // bootstrapping purposes. It is not okay to use this for conformance testing!!!
974 //
975 // Exception: If your operating system does not support multithreaded execution
976 // of any kind, then you may use this code.
977 //
978
// Fetch-and-add for the no-threads fallback build: atomically-looking in
// interface only, since this code path never runs more than one thread.
// Returns the value *a held before b was added.
cl_int ThreadPool_AtomicAdd(volatile cl_int *a, cl_int b)
{
    // since this fallback code path is not multithreaded, we just do a regular
    // add here. If your operating system supports memory-barrier-atomics, use
    // those here.
    cl_uint previous = *a;
    *a = previous + b;
    return previous;
}
990
991 // Blocking API that farms out count jobs to a thread pool.
992 // It may return with some work undone if func_ptr() returns a non-zero
993 // result.
// Blocking API that farms out count jobs to a thread pool.
// It may return with some work undone if func_ptr() returns a non-zero
// result.
// Single-threaded reference implementation: runs the jobs sequentially on
// the calling thread. Only permitted on platforms with no thread support at
// all; otherwise it aborts the test to avoid silently skipping the
// reentrancy coverage the multithreaded pool provides.
cl_int ThreadPool_Do(TPFuncPtr func_ptr, cl_uint count, void *userInfo)
{
    cl_uint currentJob = 0;
    cl_int result = CL_SUCCESS;

#ifndef MY_OS_REALLY_REALLY_DOESNT_SUPPORT_THREADS
    // THIS FUNCTION IS NOT INTENDED FOR USE!!
    log_error("ERROR: Test must be multithreaded!\n");
    exit(-1);
#else
    // Warn once per process that results were produced single threaded.
    static int spewCount = 0;

    if (0 == spewCount)
    {
        log_info("\nWARNING: The operating system is claimed not to support "
                 "threads of any sort. Running single threaded.\n");
        spewCount = 1;
    }
#endif

    // The multithreaded code should mimic this behavior:
    // each job gets its index and thread ID 0; stop at the first failure.
    for (currentJob = 0; currentJob < count; currentJob++)
        if ((result = func_ptr(currentJob, 0, userInfo))) return result;

    return CL_SUCCESS;
}
1020
GetThreadCount(void)1021 cl_uint GetThreadCount(void) { return 1; }
1022
// Single-threaded fallback: there is no pool to resize, so any request for
// more than one thread is logged and otherwise ignored.
void SetThreadCount(int count)
{
    if (count > 1)
    {
        log_info("WARNING: SetThreadCount(%d) ignored\n", count);
    }
}
1027
1028 #endif
1029