• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2    A C-program for MT19937, with initialization improved 2002/1/26.
3    Coded by Takuji Nishimura and Makoto Matsumoto.
4 
5    Before using, initialize the state by using init_genrand(seed)
6    or init_by_array(init_key, key_length).
7 
8    Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
9    All rights reserved.
10 
11    Redistribution and use in source and binary forms, with or without
12    modification, are permitted provided that the following conditions
13    are met:
14 
15      1. Redistributions of source code must retain the above copyright
16         notice, this list of conditions and the following disclaimer.
17 
18      2. Redistributions in binary form must reproduce the above copyright
19         notice, this list of conditions and the following disclaimer in the
20         documentation and/or other materials provided with the distribution.
21 
22      3. The names of its contributors may not be used to endorse or promote
23         products derived from this software without specific prior written
24         permission.
25 
26    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
30    OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 
38 
39    Any feedback is very welcome.
40    http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
41    email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
42 
43    Modifications for use in OpenCL by Ian Ollmann, Apple Inc.
44 
45 */
46 
47 #include <stdio.h>
48 #include <stdlib.h>
49 #include "mt19937.h"
50 #include "mingw_compat.h"
51 #include "harness/alloc.h"
52 
53 #ifdef __SSE2__
54 #include <emmintrin.h>
55 #endif
56 
57 /* Period parameters */
58 #define N 624 /* vector code requires multiple of 4 here */
59 #define M 397
60 #define MATRIX_A (cl_uint)0x9908b0dfUL /* constant vector a */
61 #define UPPER_MASK (cl_uint)0x80000000UL /* most significant w-r bits */
62 #define LOWER_MASK (cl_uint)0x7fffffffUL /* least significant r bits */
63 
64 typedef struct _MTdata
65 {
66     cl_uint mt[N];
67 #ifdef __SSE2__
68     cl_uint cache[N];
69 #endif
70     cl_int mti;
71 } _MTdata;
72 
73 /* initializes mt[N] with a seed */
init_genrand(cl_uint s)74 MTdata init_genrand(cl_uint s)
75 {
76     MTdata r = (MTdata)align_malloc(sizeof(_MTdata), 16);
77     if (NULL != r)
78     {
79         cl_uint *mt = r->mt;
80         int mti = 0;
81         mt[0] = s; // & 0xffffffffUL;
82         for (mti = 1; mti < N; mti++)
83         {
84             mt[mti] = (cl_uint)(
85                 1812433253UL * (mt[mti - 1] ^ (mt[mti - 1] >> 30)) + mti);
86             /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
87             /* In the previous versions, MSBs of the seed affect   */
88             /* only MSBs of the array mt[].                        */
89             /* 2002/01/09 modified by Makoto Matsumoto             */
90             // mt[mti] &= 0xffffffffUL;
91             /* for >32 bit machines */
92         }
93         r->mti = mti;
94     }
95 
96     return r;
97 }
98 
free_mtdata(MTdata d)99 void free_mtdata(MTdata d)
100 {
101     if (d) align_free(d);
102 }
103 
104 /* generates a random number on [0,0xffffffff]-interval */
genrand_int32(MTdata d)105 cl_uint genrand_int32(MTdata d)
106 {
107     /* mag01[x] = x * MATRIX_A  for x=0,1 */
108     static const cl_uint mag01[2] = { 0x0UL, MATRIX_A };
109 #ifdef __SSE2__
110     static volatile int init = 0;
111     static union {
112         __m128i v;
113         cl_uint s[4];
114     } upper_mask, lower_mask, one, matrix_a, c0, c1;
115 #endif
116 
117 
118     cl_uint *mt = d->mt;
119     cl_uint y;
120 
121     if (d->mti == N)
122     { /* generate N words at one time */
123         int kk;
124 
125 #ifdef __SSE2__
126         if (0 == init)
127         {
128             upper_mask.s[0] = upper_mask.s[1] = upper_mask.s[2] =
129                 upper_mask.s[3] = UPPER_MASK;
130             lower_mask.s[0] = lower_mask.s[1] = lower_mask.s[2] =
131                 lower_mask.s[3] = LOWER_MASK;
132             one.s[0] = one.s[1] = one.s[2] = one.s[3] = 1;
133             matrix_a.s[0] = matrix_a.s[1] = matrix_a.s[2] = matrix_a.s[3] =
134                 MATRIX_A;
135             c0.s[0] = c0.s[1] = c0.s[2] = c0.s[3] = (cl_uint)0x9d2c5680UL;
136             c1.s[0] = c1.s[1] = c1.s[2] = c1.s[3] = (cl_uint)0xefc60000UL;
137             init = 1;
138         }
139 #endif
140 
141         kk = 0;
142 #ifdef __SSE2__
143         // vector loop
144         for (; kk + 4 <= N - M; kk += 4)
145         {
146             // ((mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK))
147             __m128i vy = _mm_or_si128(
148                 _mm_and_si128(_mm_load_si128((__m128i *)(mt + kk)),
149                               upper_mask.v),
150                 _mm_and_si128(_mm_loadu_si128((__m128i *)(mt + kk + 1)),
151                               lower_mask.v));
152 
153             // y & 1 ? -1 : 0
154             __m128i mask = _mm_cmpeq_epi32(_mm_and_si128(vy, one.v), one.v);
155             // y & 1 ? MATRIX_A, 0    =  mag01[y & (cl_uint) 0x1UL]
156             __m128i vmag01 = _mm_and_si128(mask, matrix_a.v);
157             // mt[kk+M] ^ (y >> 1)
158             __m128i vr =
159                 _mm_xor_si128(_mm_loadu_si128((__m128i *)(mt + kk + M)),
160                               (__m128i)_mm_srli_epi32(vy, 1));
161             // mt[kk+M] ^ (y >> 1) ^ mag01[y & (cl_uint) 0x1UL]
162             vr = _mm_xor_si128(vr, vmag01);
163             _mm_store_si128((__m128i *)(mt + kk), vr);
164         }
165 #endif
166         for (; kk < N - M; kk++)
167         {
168             y = (cl_uint)((mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK));
169             mt[kk] = mt[kk + M] ^ (y >> 1) ^ mag01[y & (cl_uint)0x1UL];
170         }
171 
172 #ifdef __SSE2__
173         // advance to next aligned location
174         for (; kk < N - 1 && (kk & 3); kk++)
175         {
176             y = (cl_uint)((mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK));
177             mt[kk] = mt[kk + (M - N)] ^ (y >> 1) ^ mag01[y & (cl_uint)0x1UL];
178         }
179 
180         // vector loop
181         for (; kk + 4 <= N - 1; kk += 4)
182         {
183             __m128i vy = _mm_or_si128(
184                 _mm_and_si128(_mm_load_si128((__m128i *)(mt + kk)),
185                               upper_mask.v),
186                 // ((mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK))
187                 _mm_and_si128(_mm_loadu_si128((__m128i *)(mt + kk + 1)),
188                               lower_mask.v));
189 
190             // y & 1 ? -1 : 0
191             __m128i mask = _mm_cmpeq_epi32(_mm_and_si128(vy, one.v), one.v);
192             // y & 1 ? MATRIX_A, 0    =  mag01[y & (cl_uint) 0x1UL]
193             __m128i vmag01 = _mm_and_si128(mask, matrix_a.v);
194             // mt[kk+M-N] ^ (y >> 1)
195             __m128i vr =
196                 _mm_xor_si128(_mm_loadu_si128((__m128i *)(mt + kk + M - N)),
197                               _mm_srli_epi32(vy, 1));
198             // mt[kk+M] ^ (y >> 1) ^ mag01[y & (cl_uint) 0x1UL]
199             vr = _mm_xor_si128(vr, vmag01);
200             _mm_store_si128((__m128i *)(mt + kk), vr);
201         }
202 #endif
203 
204         for (; kk < N - 1; kk++)
205         {
206             y = (cl_uint)((mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK));
207             mt[kk] = mt[kk + (M - N)] ^ (y >> 1) ^ mag01[y & (cl_uint)0x1UL];
208         }
209         y = (cl_uint)((mt[N - 1] & UPPER_MASK) | (mt[0] & LOWER_MASK));
210         mt[N - 1] = mt[M - 1] ^ (y >> 1) ^ mag01[y & (cl_uint)0x1UL];
211 
212 #ifdef __SSE2__
213         // Do the tempering ahead of time in vector code
214         for (kk = 0; kk + 4 <= N; kk += 4)
215         {
216             // y = mt[k];
217             __m128i vy = _mm_load_si128((__m128i *)(mt + kk));
218             // y ^= (y >> 11);
219             vy = _mm_xor_si128(vy, _mm_srli_epi32(vy, 11));
220             // y ^= (y << 7) & (cl_uint) 0x9d2c5680UL;
221             vy = _mm_xor_si128(vy, _mm_and_si128(_mm_slli_epi32(vy, 7), c0.v));
222             // y ^= (y << 15) & (cl_uint) 0xefc60000UL;
223             vy = _mm_xor_si128(vy, _mm_and_si128(_mm_slli_epi32(vy, 15), c1.v));
224             // y ^= (y >> 18);
225             vy = _mm_xor_si128(vy, _mm_srli_epi32(vy, 18));
226             _mm_store_si128((__m128i *)(d->cache + kk), vy);
227         }
228 #endif
229 
230         d->mti = 0;
231     }
232 #ifdef __SSE2__
233     y = d->cache[d->mti++];
234 #else
235     y = mt[d->mti++];
236 
237     /* Tempering */
238     y ^= (y >> 11);
239     y ^= (y << 7) & (cl_uint)0x9d2c5680UL;
240     y ^= (y << 15) & (cl_uint)0xefc60000UL;
241     y ^= (y >> 18);
242 #endif
243 
244 
245     return y;
246 }
247 
genrand_int64(MTdata d)248 cl_ulong genrand_int64(MTdata d)
249 {
250     return ((cl_ulong)genrand_int32(d) << 32) | (cl_uint)genrand_int32(d);
251 }
252 
253 /* generates a random number on [0,1]-real-interval */
genrand_real1(MTdata d)254 double genrand_real1(MTdata d)
255 {
256     return genrand_int32(d) * (1.0 / 4294967295.0);
257     /* divided by 2^32-1 */
258 }
259 
260 /* generates a random number on [0,1)-real-interval */
genrand_real2(MTdata d)261 double genrand_real2(MTdata d)
262 {
263     return genrand_int32(d) * (1.0 / 4294967296.0);
264     /* divided by 2^32 */
265 }
266 
267 /* generates a random number on (0,1)-real-interval */
genrand_real3(MTdata d)268 double genrand_real3(MTdata d)
269 {
270     return (((double)genrand_int32(d)) + 0.5) * (1.0 / 4294967296.0);
271     /* divided by 2^32 */
272 }
273 
274 /* generates a random number on [0,1) with 53-bit resolution*/
genrand_res53(MTdata d)275 double genrand_res53(MTdata d)
276 {
277     unsigned long a = genrand_int32(d) >> 5, b = genrand_int32(d) >> 6;
278     return (a * 67108864.0 + b) * (1.0 / 9007199254740992.0);
279 }
280 
genrand_bool(MTdata d)281 bool genrand_bool(MTdata d) { return ((cl_uint)genrand_int32(d) & 1); }
282