/*
* cuda_rnd.h
*
* Created on: 31.3.2012
* Author: Teemu Rantalaiho (teemu.rantalaiho@helsinki.fi)
*
*
* Copyright 2012 Teemu Rantalaiho
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Uses WarpStandard Algorithm by David B. Thomas (see warp_standard.h)
*
*/
#ifndef CUDA_RND_H_
#define CUDA_RND_H_
#include "warp_standard.h"
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <stdlib.h> // for lrand48(), malloc(), free()
// Instructions: use the three macros below (INIT_WS_STATE, GENERATE_WS_RND,
// SAVE_WS_STATE) to draw good random numbers online inside a kernel; see the
// example sketch after the macro definitions. For pre-generating random
// numbers into a device buffer, use the functions further below.
// Note: NTHREADS_PER_BLOCK has to be a compile-time constant (and a multiple
// of 32) for this to work!
// TODO: Experimental code! Quality of random numbers not yet tested, so expect bugs here!
__shared__ unsigned int * s_warp_standard_shmem;
__device__ unsigned int * s_warp_standard_regs;
// Turn this flag on if you have divergent threads within a warp! DEFAULT: OFF
__shared__ bool s_warp_safe_mode;
#define INIT_WS_STATE(STATEPTR, NTHREADS_PER_BLOCK) \
unsigned int tmpregs[WarpStandard_REG_COUNT]; \
__shared__ unsigned int tmpshs[NTHREADS_PER_BLOCK]; \
WarpStandard_LoadState(STATEPTR, tmpregs, tmpshs); \
s_warp_standard_shmem = tmpshs; \
s_warp_safe_mode = false; \
s_warp_standard_regs = tmpregs
#ifndef __CUDA_ARCH__
// Host-side compilation pass: no-op stand-ins for the device-only macros.
#define GENERATE_WS_RND(X) 0
#define SET_SAFE_MODE(FLAG)
#else
#define GENERATE_WS_RND(X) WarpStandard_Generate(s_warp_standard_regs, s_warp_standard_shmem, s_warp_safe_mode)
#define SET_SAFE_MODE(FLAG) s_warp_safe_mode = (FLAG);
#endif
#define SAVE_WS_STATE(STATEPTR) WarpStandard_SaveState(s_warp_standard_regs, s_warp_standard_shmem, STATEPTR)
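// Example: a minimal usage sketch (the kernel name, launch sizes and output
// buffer are illustrative assumptions, not part of this header). All threads
// of a warp must reach GENERATE_WS_RND together, so draw first and mask the
// store; if divergence cannot be avoided, call SET_SAFE_MODE(true); first.
//
//   #define MY_BLOCKSIZE 128 // compile-time constant, multiple of 32
//   __global__ void exampleKernel(unsigned int* state, unsigned int* out, int n)
//   {
//       INIT_WS_STATE(state, MY_BLOCKSIZE);
//       int idx = blockIdx.x * blockDim.x + threadIdx.x;
//       unsigned int r = GENERATE_WS_RND(0); // the argument is ignored
//       if (idx < n)
//           out[idx] = r;
//       SAVE_WS_STATE(state);
//   }
//
// Seed one 32-bit state word per thread (e.g. with seedRnds() below) before
// the first launch.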
// Alternatively, pre-generate random numbers with the functions below.
// seedRnds() produces one 32-bit seed per thread in device memory, using either
// /dev/urandom or lrand48() (the latter will have worse quality). Returns NULL
// on failure.
static unsigned int* seedRnds(bool useDevRandom, int nThreadsTot)
{
    unsigned int* result = NULL;
    unsigned int* h_result;
    cudaMalloc(&result, nThreadsTot * sizeof(unsigned int));
    h_result = (unsigned int*)malloc(nThreadsTot * sizeof(unsigned int));
    if (!result || !h_result){
        if (result) cudaFree(result);
        free(h_result);
        return NULL;
    }
    if (useDevRandom){
        FILE* rndf = fopen("/dev/urandom", "r");
        if (!rndf){
            cudaFree(result);
            free(h_result);
            return NULL;
        }
        for (int i = 0; i < nThreadsTot; i++)
            if (fread(&h_result[i], sizeof(unsigned int), 1, rndf) != 1){
                fclose(rndf);
                cudaFree(result);
                free(h_result);
                return NULL;
            }
        fclose(rndf);
    } else {
        for (int i = 0; i < nThreadsTot; i++){
            // lrand48() gives 31 random bits; combine the middle 16 bits of
            // two draws into one full 32-bit seed.
            unsigned int rnd1 = (unsigned int)lrand48();
            unsigned int rnd2 = (unsigned int)lrand48();
            h_result[i] = ((rnd1 & 0x00FFFF00u) << 8) | ((rnd2 & 0x00FFFF00u) >> 8);
        }
    }
    cudaMemcpy(result, h_result, nThreadsTot * sizeof(unsigned int), cudaMemcpyHostToDevice);
    free(h_result);
    return result;
}
#define GEN_RND_BLOCKSIZE_LOG2 7
#define GEN_RND_BLOCKSIZE (1 << GEN_RND_BLOCKSIZE_LOG2)
#define GEN_RND_MAXBLOCKS_LOG2 6
#define GEN_RND_MAXBLOCKS (1 << GEN_RND_MAXBLOCKS_LOG2)
__global__
void cuda_rndgenKernel(unsigned int* output, unsigned int* state, size_t nOutputs, int nsteps)
{
    size_t idx = (size_t)(blockIdx.x * blockDim.x + threadIdx.x);
    unsigned int tmpregs[WarpStandard_REG_COUNT];
    __shared__ unsigned int tmpshs[GEN_RND_BLOCKSIZE];
    WarpStandard_LoadState(state, tmpregs, tmpshs);
    // Grid-stride loop: every thread generates on each step (whole warps must
    // participate in the generator), but only in-range results are stored.
    for (int step = 0; step < nsteps; step++){
        unsigned int rndout = WarpStandard_Generate(tmpregs, tmpshs, false);
        if (idx < nOutputs){
            output[idx] = rndout;
        }
        idx += blockDim.x * gridDim.x;
    }
    WarpStandard_SaveState(tmpregs, tmpshs, state);
}
// Pre-generates nOutputs random numbers into device memory and returns the
// device pointer (NULL on failure). Allocates the output buffer unless one is
// passed in via result. The generator state lives in a function-local static
// and is (re)seeded on the first call or when newseed is true. NOTE: the seed
// parameter is currently unused.
static unsigned int* cudaGenerateRnds(
    size_t nOutputs,
    unsigned int* result = NULL,
    bool newseed = false,
    bool useDevRndSeed = true,
    int seed = 0,
    cudaStream_t stream = 0)
{
    static unsigned int* state = NULL;
    (void)seed; // not used yet
    if (!state || newseed){
        if (state) cudaFree(state);
        state = seedRnds(useDevRndSeed, GEN_RND_BLOCKSIZE * GEN_RND_MAXBLOCKS);
        if (!state) return NULL;
    }
    if (nOutputs == 0)
        return NULL;
    const dim3 block = GEN_RND_BLOCKSIZE;
    // Round the block count up so that even nOutputs < GEN_RND_BLOCKSIZE gets
    // one block; cap the grid and let the kernel loop over multiple steps.
    dim3 grid = (unsigned int)((nOutputs + GEN_RND_BLOCKSIZE - 1) >> GEN_RND_BLOCKSIZE_LOG2);
    int steps = 1;
    if (grid.x > GEN_RND_MAXBLOCKS){
        grid.x = GEN_RND_MAXBLOCKS;
        steps = (int)(nOutputs >> (GEN_RND_BLOCKSIZE_LOG2 + GEN_RND_MAXBLOCKS_LOG2));
        if (((size_t)steps << (GEN_RND_BLOCKSIZE_LOG2 + GEN_RND_MAXBLOCKS_LOG2)) < nOutputs)
            steps++;
    }
    if (!result){
        cudaMalloc(&result, sizeof(unsigned int) * nOutputs);
        if (!result) return NULL;
    }
    cuda_rndgenKernel<<<grid, block, 0, stream>>>(result, state, nOutputs, steps);
    return result;
}
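// Example host-side usage (a minimal sketch; the size, variable names and
// error handling are illustrative assumptions):
//
//   size_t n = 1 << 20;
//   unsigned int* d_rnds = cudaGenerateRnds(n); // allocates and fills n numbers
//   if (!d_rnds) { /* seeding or allocation failed */ }
//   // ... consume d_rnds in your kernels ...
//   cudaFree(d_rnds);
//
// Later calls reuse the persistent generator state, so each call produces a
// fresh batch; pass newseed = true to re-seed.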
#endif /* CUDA_RND_H_ */