Concurrently initializing many arrays with random numbers using Curand and CUDA kernel
我正在尝试在GPU上并行地用随机数初始化多个数组,每个数组有100个元素。但是,我的程序并没有产生各不相同的随机数:在Visual Studio中调试代码时,我看到数组中的每个元素都是同一个数字。这段代码的目的是优化CImg的FilledTriangles例程,在可能的地方使用GPU。
我哪里做错了,该如何解决?这是我的代码:
/*
 * Seeds one cuRAND generator state per thread.
 *
 * state: device array with one curandState slot per launched thread.
 * seed:  seed value shared by every thread; each thread passes its own
 *        global index as the sequence number and 0 as the offset.
 *
 * There is no bounds check, so the launch must not create more threads
 * than `state` has elements.
 */
__global__ void initCurand(curandState* state, unsigned long seed)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curand_init(seed, idx, 0, &state[idx]);
    __syncthreads();
}

/*
 * CUDA kernel that will execute 100 threads in parallel
 *
 * Each thread fills element `idx` of every output array with values drawn
 * from its own generator state:
 *   posx    = u * width
 *   posy    = u * height
 *   rayon   = 10 + u * 50
 *   angle   = u * 360
 *   veloc   = u * 20 - 10
 *   color   = three bytes, each (unsigned char)(u * 255)
 *   opacity = 0.3 + 1.5 * u
 * where every `u` is a fresh curand_uniform() draw.
 *
 * There is no bounds check, so the launch must create exactly one thread
 * per array element.
 *
 * NOTE(review): `color` is dereferenced as a table of row pointers
 * (color[idx][k]). The host code shown in this document obtains d_color
 * from cudaMallocPitch, which yields a single pitched buffer rather than a
 * table of pointers - confirm that the argument really is a device array of
 * device pointers, otherwise these writes go through invalid addresses.
 */
__global__ void initializeArrays(float* posx, float* posy,float* rayon, float* veloc,
                                 float* opacity ,float * angle, unsigned char** color,
                                 int height, int width, curandState* state){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Work on a private copy of this thread's generator state.
    curandState localState = state[idx];
    __syncthreads();

    posx[idx] = (float)(curand_uniform(&localState)*width);
    posy[idx] = (float)(curand_uniform(&localState)*height);
    rayon[idx] = (float)(10 + curand_uniform(&localState)*50);
    angle[idx] = (float)(curand_uniform(&localState)*360);
    veloc[idx] = (float)(curand_uniform(&localState)*20 - 10);

    color[idx][0] = (unsigned char)(curand_uniform(&localState)*255);
    color[idx][1] = (unsigned char)(curand_uniform(&localState)*255);
    color[idx][2] = (unsigned char)(curand_uniform(&localState)*255);

    opacity[idx] = (float)(0.3 + 1.5*curand_uniform(&localState));
}
这是准备并调用这些内核的主机代码:我正在尝试在网格的一个块中创建100个线程(每个元素对应一个线程)。
// launch grid of threads: a single block holding 100 threads, one per element
dim3 dimBlock(100);
dim3 dimGrid(1);

// The execution configuration is <<<grid, block>>>: grid dimensions go first.
// Passing them the other way round would start 100 one-thread blocks instead.
initCurand<<<dimGrid, dimBlock>>>(devState, unsigned(time(nullptr)));
// A kernel launch returns nothing; cudaGetLastError() yields launch-time errors.
errCheck(cudaGetLastError());

// synchronize the device and the host; the returned status carries any error
// raised while the kernel was executing
errCheck(cudaDeviceSynchronize());

initializeArrays<<<dimGrid, dimBlock>>>(d_posx, d_posy, d_rayon, d_veloc, d_opacity,
                                        d_angle, d_color, img0.height(), img0.width(),
                                        devState);
errCheck(cudaGetLastError());
准备工作(声明与内存分配):
// Define random properties (pos, size, colors, ..) for all triangles that will be displayed.
float posx[100], posy[100], rayon[100], angle[100], veloc[100], opacity[100];

// Define the same properties but for the device
float* d_posx;
float* d_posy;
float* d_rayon;
float* d_angle;
float* d_veloc;
float* d_opacity;
//unsigned char d_color[100][3];
// NOTE(review): cudaMallocPitch below fills this with the address of one
// pitched buffer, not with a table of row pointers - confirm that the
// declared type matches how the kernels index it.
unsigned char** d_color;
curandState* devState;
cudaError_t err;

// allocate memory on the device for the device arrays
// Every call is checked on its own: reusing `err` without checking in between
// would let a later success hide an earlier failure.
err = cudaMalloc((void**)&d_posx, 100 * sizeof(float));
errCheck(err);
err = cudaMalloc((void**)&d_posy, 100 * sizeof(float));
errCheck(err);
err = cudaMalloc((void**)&d_rayon, 100 * sizeof(float));
errCheck(err);
err = cudaMalloc((void**)&d_angle, 100 * sizeof(float));
errCheck(err);
err = cudaMalloc((void**)&d_veloc, 100 * sizeof(float));
errCheck(err);
err = cudaMalloc((void**)&d_opacity, 100 * sizeof(float));
errCheck(err);
// one generator state per thread
err = cudaMalloc((void**)&devState, 100*sizeof(curandState));
errCheck(err);

size_t pitch;
//allocated the device memory for source array: 100 rows, each 3 bytes wide;
// `pitch` receives the row stride in bytes chosen by the runtime
err = cudaMallocPitch(&d_color, &pitch, 3 * sizeof(unsigned char),100);
errCheck(err);
获得结果:
// get the populated arrays back to the host for use
// Each copy is checked individually so that a failure is not overwritten by
// the status of the next call.
err = cudaMemcpy(posx,d_posx, 100 * sizeof(float), cudaMemcpyDeviceToHost);
errCheck(err);
err = cudaMemcpy(posy,d_posy, 100 * sizeof(float), cudaMemcpyDeviceToHost);
errCheck(err);
err = cudaMemcpy(rayon,d_rayon, 100 * sizeof(float), cudaMemcpyDeviceToHost);
errCheck(err);
err = cudaMemcpy(veloc,d_veloc, 100 * sizeof(float), cudaMemcpyDeviceToHost);
errCheck(err);
err = cudaMemcpy(opacity,d_opacity, 100 * sizeof(float), cudaMemcpyDeviceToHost);
errCheck(err);
err = cudaMemcpy(angle,d_angle, 100 * sizeof(float), cudaMemcpyDeviceToHost);
errCheck(err);

// cudaMemcpy2D takes (dst, dst pitch, src, src pitch, width in bytes, rows, kind).
// The device buffer was allocated as 100 rows of 3 bytes with a row stride of
// `pitch` bytes, so `pitch` is the SOURCE pitch and the copy is 3 bytes x 100 rows.
// Assumes the host `color` array is tightly packed as 100 rows of 3 bytes
// (row stride 3) - TODO confirm against its declaration.
err = cudaMemcpy2D(color, 3 * sizeof(unsigned char),
                   d_color, pitch,
                   3 * sizeof(unsigned char), 100,
                   cudaMemcpyDeviceToHost);
errCheck(err);
您肯定需要把下面这一行:
1 | err = cudaMalloc((void**)&devState, 100*sizeof(float)); |
改为:
1 | err = cudaMalloc((void**)&devState, 100*sizeof(curandState)); |
如果您用cuda-memcheck运行代码,就会发现这一点。由于这个分配大小的错误,您的initCurand内核存在大量越界访问。
您还应该对所有cuda调用和所有内核启动进行错误检查。我相信您的第二次内核调用由于对 color 数组的错误处理而失败。
通常,当我们使用 cudaMallocPitch 分配数组时,需要把返回的 pitch 参数传给内核,并用它来计算每一行的起始地址。
我能够通过进行以下更改来修复它:
/*
 * Fills element `idx` of every output array with random values, one thread
 * per element.
 *
 * posx, posy, rayon, veloc, opacity, angle:
 *         device float arrays with one entry per launched thread.
 * color:  base address of a pitched device buffer holding one 3-byte row
 *         per element.
 * height, width:
 *         scale factors applied to the posy / posx draws.
 * state:  device array with one initialised curandState per thread.
 * pitch:  row stride of `color` in bytes, as returned by cudaMallocPitch.
 *
 * Values written (every `u` is a fresh curand_uniform() draw):
 *   posx = u * width          posy  = u * height
 *   rayon = 10 + u * 50       angle = u * 360
 *   veloc = u * 20 - 10       opacity = 0.3 + 1.5 * u
 *   color row = three bytes, each (unsigned char)(u * 255)
 *
 * There is no bounds check, so the launch must create exactly one thread
 * per array element.
 */
__global__ void initializeArrays(float* posx, float* posy,float* rayon, float* veloc,
                                 float* opacity,float * angle, unsigned char* color,
                                 int height, int width, curandState* state, size_t pitch){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Draw from a private copy of the generator state. No block barrier is
    // needed here: threads share no data with each other.
    curandState localState = state[idx];

    posx[idx]  = curand_uniform(&localState) * width;
    posy[idx]  = curand_uniform(&localState) * height;
    rayon[idx] = 10 + curand_uniform(&localState) * 50;
    angle[idx] = curand_uniform(&localState) * 360;
    veloc[idx] = curand_uniform(&localState) * 20 - 10;

    // `color` is a byte pointer, so row `idx` starts idx * pitch bytes in.
    unsigned char* row = color + idx * pitch;
    row[0] = (unsigned char)(curand_uniform(&localState) * 255);
    row[1] = (unsigned char)(curand_uniform(&localState) * 255);
    row[2] = (unsigned char)(curand_uniform(&localState) * 255);

    // float literals keep the arithmetic in single precision
    opacity[idx] = 0.3f + 1.5f * curand_uniform(&localState);

    // Store the advanced state so that a later launch continues the sequence
    // instead of reproducing the same numbers.
    state[idx] = localState;
}
和
// The execution configuration is <<<grid, block>>>: dimGrid (1 block) goes
// first, dimBlock (100 threads) second.
initializeArrays<<<dimGrid, dimBlock>>>(d_posx, d_posy, d_rayon, d_veloc, d_opacity,
                                        d_angle, d_color, img0.height(), img0.width(),
                                        devState, pitch);
和
1 | unsigned char* d_color; |
通过这些更改,我消除了所发现的错误,代码也输出了各不相同的随机值。我尚未检查所有的值,但这应该足以帮助您入手。