1 /*
2 A C-program for MT19937, with initialization improved 2002/1/26.
3 Coded by Takuji Nishimura and Makoto Matsumoto.
4
5 Before using, initialize the state by using init_genrand(seed)
6 or init_by_array(init_key, key_length).
7
8 Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
9 All rights reserved.
10
11 Redistribution and use in source and binary forms, with or without
12 modification, are permitted provided that the following conditions
13 are met:
14
15 1. Redistributions of source code must retain the above copyright
16 notice, this list of conditions and the following disclaimer.
17
18 2. Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 3. The names of its contributors may not be used to endorse or promote
23 products derived from this software without specific prior written
24 permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
30 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37
38
39 Any feedback is very welcome.
40 http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
41 email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
42
43 Modifications for use in OpenCL by Ian Ollmann, Apple Inc.
44
45 */
46
47 #include <stdio.h>
48 #include <stdlib.h>
49 #include "mt19937.h"
50 #include "mingw_compat.h"
51 #include "harness/alloc.h"
52
53 #ifdef __SSE2__
54 #include <emmintrin.h>
55 #endif
56
57 /* Period parameters */
58 #define N 624 /* vector code requires multiple of 4 here */
59 #define M 397
60 #define MATRIX_A (cl_uint)0x9908b0dfUL /* constant vector a */
61 #define UPPER_MASK (cl_uint)0x80000000UL /* most significant w-r bits */
62 #define LOWER_MASK (cl_uint)0x7fffffffUL /* least significant r bits */
63
64 typedef struct _MTdata
65 {
66 cl_uint mt[N];
67 #ifdef __SSE2__
68 cl_uint cache[N];
69 #endif
70 cl_int mti;
71 } _MTdata;
72
73 /* initializes mt[N] with a seed */
init_genrand(cl_uint s)74 MTdata init_genrand(cl_uint s)
75 {
76 MTdata r = (MTdata)align_malloc(sizeof(_MTdata), 16);
77 if (NULL != r)
78 {
79 cl_uint *mt = r->mt;
80 int mti = 0;
81 mt[0] = s; // & 0xffffffffUL;
82 for (mti = 1; mti < N; mti++)
83 {
84 mt[mti] = (cl_uint)(
85 1812433253UL * (mt[mti - 1] ^ (mt[mti - 1] >> 30)) + mti);
86 /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
87 /* In the previous versions, MSBs of the seed affect */
88 /* only MSBs of the array mt[]. */
89 /* 2002/01/09 modified by Makoto Matsumoto */
90 // mt[mti] &= 0xffffffffUL;
91 /* for >32 bit machines */
92 }
93 r->mti = mti;
94 }
95
96 return r;
97 }
98
free_mtdata(MTdata d)99 void free_mtdata(MTdata d)
100 {
101 if (d) align_free(d);
102 }
103
104 /* generates a random number on [0,0xffffffff]-interval */
genrand_int32(MTdata d)105 cl_uint genrand_int32(MTdata d)
106 {
107 /* mag01[x] = x * MATRIX_A for x=0,1 */
108 static const cl_uint mag01[2] = { 0x0UL, MATRIX_A };
109 #ifdef __SSE2__
110 static volatile int init = 0;
111 static union {
112 __m128i v;
113 cl_uint s[4];
114 } upper_mask, lower_mask, one, matrix_a, c0, c1;
115 #endif
116
117
118 cl_uint *mt = d->mt;
119 cl_uint y;
120
121 if (d->mti == N)
122 { /* generate N words at one time */
123 int kk;
124
125 #ifdef __SSE2__
126 if (0 == init)
127 {
128 upper_mask.s[0] = upper_mask.s[1] = upper_mask.s[2] =
129 upper_mask.s[3] = UPPER_MASK;
130 lower_mask.s[0] = lower_mask.s[1] = lower_mask.s[2] =
131 lower_mask.s[3] = LOWER_MASK;
132 one.s[0] = one.s[1] = one.s[2] = one.s[3] = 1;
133 matrix_a.s[0] = matrix_a.s[1] = matrix_a.s[2] = matrix_a.s[3] =
134 MATRIX_A;
135 c0.s[0] = c0.s[1] = c0.s[2] = c0.s[3] = (cl_uint)0x9d2c5680UL;
136 c1.s[0] = c1.s[1] = c1.s[2] = c1.s[3] = (cl_uint)0xefc60000UL;
137 init = 1;
138 }
139 #endif
140
141 kk = 0;
142 #ifdef __SSE2__
143 // vector loop
144 for (; kk + 4 <= N - M; kk += 4)
145 {
146 // ((mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK))
147 __m128i vy = _mm_or_si128(
148 _mm_and_si128(_mm_load_si128((__m128i *)(mt + kk)),
149 upper_mask.v),
150 _mm_and_si128(_mm_loadu_si128((__m128i *)(mt + kk + 1)),
151 lower_mask.v));
152
153 // y & 1 ? -1 : 0
154 __m128i mask = _mm_cmpeq_epi32(_mm_and_si128(vy, one.v), one.v);
155 // y & 1 ? MATRIX_A, 0 = mag01[y & (cl_uint) 0x1UL]
156 __m128i vmag01 = _mm_and_si128(mask, matrix_a.v);
157 // mt[kk+M] ^ (y >> 1)
158 __m128i vr =
159 _mm_xor_si128(_mm_loadu_si128((__m128i *)(mt + kk + M)),
160 (__m128i)_mm_srli_epi32(vy, 1));
161 // mt[kk+M] ^ (y >> 1) ^ mag01[y & (cl_uint) 0x1UL]
162 vr = _mm_xor_si128(vr, vmag01);
163 _mm_store_si128((__m128i *)(mt + kk), vr);
164 }
165 #endif
166 for (; kk < N - M; kk++)
167 {
168 y = (cl_uint)((mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK));
169 mt[kk] = mt[kk + M] ^ (y >> 1) ^ mag01[y & (cl_uint)0x1UL];
170 }
171
172 #ifdef __SSE2__
173 // advance to next aligned location
174 for (; kk < N - 1 && (kk & 3); kk++)
175 {
176 y = (cl_uint)((mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK));
177 mt[kk] = mt[kk + (M - N)] ^ (y >> 1) ^ mag01[y & (cl_uint)0x1UL];
178 }
179
180 // vector loop
181 for (; kk + 4 <= N - 1; kk += 4)
182 {
183 __m128i vy = _mm_or_si128(
184 _mm_and_si128(_mm_load_si128((__m128i *)(mt + kk)),
185 upper_mask.v),
186 // ((mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK))
187 _mm_and_si128(_mm_loadu_si128((__m128i *)(mt + kk + 1)),
188 lower_mask.v));
189
190 // y & 1 ? -1 : 0
191 __m128i mask = _mm_cmpeq_epi32(_mm_and_si128(vy, one.v), one.v);
192 // y & 1 ? MATRIX_A, 0 = mag01[y & (cl_uint) 0x1UL]
193 __m128i vmag01 = _mm_and_si128(mask, matrix_a.v);
194 // mt[kk+M-N] ^ (y >> 1)
195 __m128i vr =
196 _mm_xor_si128(_mm_loadu_si128((__m128i *)(mt + kk + M - N)),
197 _mm_srli_epi32(vy, 1));
198 // mt[kk+M] ^ (y >> 1) ^ mag01[y & (cl_uint) 0x1UL]
199 vr = _mm_xor_si128(vr, vmag01);
200 _mm_store_si128((__m128i *)(mt + kk), vr);
201 }
202 #endif
203
204 for (; kk < N - 1; kk++)
205 {
206 y = (cl_uint)((mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK));
207 mt[kk] = mt[kk + (M - N)] ^ (y >> 1) ^ mag01[y & (cl_uint)0x1UL];
208 }
209 y = (cl_uint)((mt[N - 1] & UPPER_MASK) | (mt[0] & LOWER_MASK));
210 mt[N - 1] = mt[M - 1] ^ (y >> 1) ^ mag01[y & (cl_uint)0x1UL];
211
212 #ifdef __SSE2__
213 // Do the tempering ahead of time in vector code
214 for (kk = 0; kk + 4 <= N; kk += 4)
215 {
216 // y = mt[k];
217 __m128i vy = _mm_load_si128((__m128i *)(mt + kk));
218 // y ^= (y >> 11);
219 vy = _mm_xor_si128(vy, _mm_srli_epi32(vy, 11));
220 // y ^= (y << 7) & (cl_uint) 0x9d2c5680UL;
221 vy = _mm_xor_si128(vy, _mm_and_si128(_mm_slli_epi32(vy, 7), c0.v));
222 // y ^= (y << 15) & (cl_uint) 0xefc60000UL;
223 vy = _mm_xor_si128(vy, _mm_and_si128(_mm_slli_epi32(vy, 15), c1.v));
224 // y ^= (y >> 18);
225 vy = _mm_xor_si128(vy, _mm_srli_epi32(vy, 18));
226 _mm_store_si128((__m128i *)(d->cache + kk), vy);
227 }
228 #endif
229
230 d->mti = 0;
231 }
232 #ifdef __SSE2__
233 y = d->cache[d->mti++];
234 #else
235 y = mt[d->mti++];
236
237 /* Tempering */
238 y ^= (y >> 11);
239 y ^= (y << 7) & (cl_uint)0x9d2c5680UL;
240 y ^= (y << 15) & (cl_uint)0xefc60000UL;
241 y ^= (y >> 18);
242 #endif
243
244
245 return y;
246 }
247
genrand_int64(MTdata d)248 cl_ulong genrand_int64(MTdata d)
249 {
250 return ((cl_ulong)genrand_int32(d) << 32) | (cl_uint)genrand_int32(d);
251 }
252
253 /* generates a random number on [0,1]-real-interval */
genrand_real1(MTdata d)254 double genrand_real1(MTdata d)
255 {
256 return genrand_int32(d) * (1.0 / 4294967295.0);
257 /* divided by 2^32-1 */
258 }
259
260 /* generates a random number on [0,1)-real-interval */
genrand_real2(MTdata d)261 double genrand_real2(MTdata d)
262 {
263 return genrand_int32(d) * (1.0 / 4294967296.0);
264 /* divided by 2^32 */
265 }
266
267 /* generates a random number on (0,1)-real-interval */
genrand_real3(MTdata d)268 double genrand_real3(MTdata d)
269 {
270 return (((double)genrand_int32(d)) + 0.5) * (1.0 / 4294967296.0);
271 /* divided by 2^32 */
272 }
273
274 /* generates a random number on [0,1) with 53-bit resolution*/
genrand_res53(MTdata d)275 double genrand_res53(MTdata d)
276 {
277 unsigned long a = genrand_int32(d) >> 5, b = genrand_int32(d) >> 6;
278 return (a * 67108864.0 + b) * (1.0 / 9007199254740992.0);
279 }
280
genrand_bool(MTdata d)281 bool genrand_bool(MTdata d) { return ((cl_uint)genrand_int32(d) & 1); }
282