关于并行处理：使用cuRAND和CUDA内核并发地用随机数初始化多个数组

Concurrently initializing many arrays with random numbers using Curand and CUDA kernel

我正在尝试在GPU上用随机生成的数字并发地初始化若干并行数组，每个数组有100个元素。但是，我的例程并没有产生各不相同的随机数。当我在Visual Studio中调试代码时，我看到数组中的每个元素都是同一个数字。这段代码的目的是优化CImg的FilledTriangles例程，让它在可能的地方使用GPU。

我哪里做错了，该如何解决？这是我的代码：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28

/*
 * Seeds one cuRAND generator state per thread.
 *
 * Each thread computes its flat 1-D global index and initialises state[idx]
 * with the common seed, using idx as the sequence number and 0 as the offset.
 *
 * Parameters:
 *   state - array of generator states; element idx is written by thread idx
 *   seed  - seed shared by every thread
 *
 * Precondition: the launch must not create more threads than `state` has
 * elements; the kernel has no size parameter and therefore cannot guard
 * against out-of-range indices itself.
 */
__global__ void initCurand(curandState* state, unsigned long seed)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Same seed, different sequence number per thread.
    curand_init(seed, idx, 0, &state[idx]);

    // No barrier here: every thread touches only its own state[idx], so
    // there is nothing to synchronise inside the block.
}

/*
* CUDA kernel that fills one element of each property array per thread with
* uniform random values (the host code that launches it aims for 100 threads).
*
* Each thread uses its flat 1-D index idx to read state[idx] and to write
* element idx of posx, posy, rayon, angle, veloc, opacity and color.
* There is no bounds check: the launch must not create more threads than the
* arrays have elements.
*
* NOTE(review): color is dereferenced twice (color[idx][k]), so color[idx]
* must itself hold a valid device pointer. The host code shown alongside this
* kernel allocates the buffer with cudaMallocPitch, which presumably yields a
* flat pitched buffer rather than a pointer table - confirm before relying on
* the color writes.
*/

__global__ void initializeArrays(float* posx, float* posy,float* rayon, float* veloc, float* opacity
,float * angle, unsigned char** color, int height, int width, curandState* state){

// Flat global thread index; selects the element this thread owns.
int idx = threadIdx.x + blockIdx.x * blockDim.x;

// Work on a local copy of this thread's generator state.
// Note: the advanced copy is never stored back into state[idx].
curandState localState = state[idx];
// Block-wide barrier; nothing shared between threads is read or written after it.
__syncthreads();

// Nine consecutive draws from the same generator, scaled per property.
posx[idx] = (float)(curand_uniform(&localState)*width);
posy[idx] = (float)(curand_uniform(&localState)*height);
rayon[idx] = (float)(10 + curand_uniform(&localState)*50);
angle[idx] = (float)(curand_uniform(&localState)*360);
veloc[idx] = (float)(curand_uniform(&localState)*20 - 10);
// Three bytes per element (presumably R, G, B - confirm against the caller).
color[idx][0] = (unsigned char)(curand_uniform(&localState)*255);
color[idx][1] = (unsigned char)(curand_uniform(&localState)*255);
color[idx][2] = (unsigned char)(curand_uniform(&localState)*255);
// The 0.3 and 1.5 literals are double, so this expression is evaluated in double.
opacity[idx] = (float)(0.3 + 1.5*curand_uniform(&localState));
}

这是准备并调用这些内核的主机代码：我想在网格的一个块中创建100个线程（每个元素对应一个线程）。

1
2
3
4
5
6
7
8

// Launch configuration: a single block of 100 threads, one thread per element.
dim3 dimBlock(100);
dim3 dimGrid(1);

// The execution configuration is <<<grid, block>>> - grid dimensions come
// first. Passing them the other way round launches 100 blocks of 1 thread.
initCurand<<<dimGrid, dimBlock>>>(devState, unsigned(time(nullptr)));
// A kernel launch returns nothing; launch-configuration errors are only
// visible through cudaGetLastError().
errCheck(cudaGetLastError());
// Synchronize the device and the host; this also reports faults raised while
// the seeding kernel was running.
errCheck(cudaDeviceSynchronize());
initializeArrays<<<dimGrid, dimBlock>>>(d_posx, d_posy, d_rayon, d_veloc, d_opacity, d_angle,d_color, img0.height(), img0.width(), devState);
errCheck(cudaGetLastError());

准备工作：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26

// Define random properties (pos, size, colors, ..) for all triangles that will be displayed.
float posx[100], posy[100], rayon[100], angle[100], veloc[100], opacity[100];
// Define the same properties but for the device
float* d_posx;
float* d_posy;
float* d_rayon;
float* d_angle;
float* d_veloc;
float* d_opacity;
//unsigned char d_color[100][3];
// NOTE(review): cudaMallocPitch below fills this with the address of one flat
// pitched buffer; the double-pointer type suggests a pointer table instead -
// confirm that the kernel receiving it indexes through the pitch.
unsigned char** d_color;
curandState* devState;
cudaError_t err;

// Allocate memory on the device for the device arrays. Every return code is
// checked right away so the first failing allocation is the one reported,
// rather than being overwritten by a later call.
err = cudaMalloc((void**)&d_posx, 100 * sizeof(float));
errCheck(err);
err = cudaMalloc((void**)&d_posy, 100 * sizeof(float));
errCheck(err);
err = cudaMalloc((void**)&d_rayon, 100 * sizeof(float));
errCheck(err);
err = cudaMalloc((void**)&d_angle, 100 * sizeof(float));
errCheck(err);
err = cudaMalloc((void**)&d_veloc, 100 * sizeof(float));
errCheck(err);
err = cudaMalloc((void**)&d_opacity, 100 * sizeof(float));
errCheck(err);
// One generator state per element; must be sized with sizeof(curandState).
err = cudaMalloc((void**)&devState, 100*sizeof(curandState));
errCheck(err);
size_t pitch;
// Pitched allocation for the colors: 100 rows of 3 bytes each. `pitch`
// receives the row stride in bytes chosen by the runtime.
err = cudaMallocPitch(&d_color, &pitch, 3 * sizeof(unsigned char),100);
errCheck(err);

获得结果：

1
2
3
4
5
6
7
8

// Get the populated arrays back to the host for use. Each return code is
// checked so that a kernel fault or a bad copy is reported where it happens.
err = cudaMemcpy(posx,d_posx, 100 * sizeof(float), cudaMemcpyDeviceToHost);
errCheck(err);
err = cudaMemcpy(posy,d_posy, 100 * sizeof(float), cudaMemcpyDeviceToHost);
errCheck(err);
err = cudaMemcpy(rayon,d_rayon, 100 * sizeof(float), cudaMemcpyDeviceToHost);
errCheck(err);
err = cudaMemcpy(veloc,d_veloc, 100 * sizeof(float), cudaMemcpyDeviceToHost);
errCheck(err);
err = cudaMemcpy(opacity,d_opacity, 100 * sizeof(float), cudaMemcpyDeviceToHost);
errCheck(err);
err = cudaMemcpy(angle,d_angle, 100 * sizeof(float), cudaMemcpyDeviceToHost);
errCheck(err);
// cudaMemcpy2D(dst, dpitch, src, spitch, widthInBytes, heightInRows, kind):
//   - the source stride is the pitch returned by cudaMallocPitch,
//   - each row carries 3 bytes of payload and there are 100 rows.
// NOTE(review): assumes the host `color` buffer is 100 tightly packed rows of
// 3 bytes (e.g. unsigned char color[100][3]) - confirm against its declaration.
err = cudaMemcpy2D(color, 3 * sizeof(unsigned char), d_color, pitch,
                   3 * sizeof(unsigned char), 100, cudaMemcpyDeviceToHost);
errCheck(err);

您肯定需要对此进行更改：

1	err = cudaMalloc((void**)&devState, 100*sizeof(float));

改成这样：

1	err = cudaMalloc((void**)&devState, 100*sizeof(curandState));

如果您用cuda-memcheck运行代码，就会发现这个问题：由于为devState分配的空间不足，您的initCurand内核存在大量越界访问。

您还应该对所有CUDA API调用和所有内核启动进行错误检查。我认为您的第二次内核调用是由于对color[][]数组的错误访问而失败的。

通常，用cudaMallocPitch创建数组之后，需要借助pitch参数来访问它。C语言的双重下标写法在这里不能直接使用，因为编译器并不知道数组实际的行宽（即pitch）。

我能够通过进行以下更改来修复它：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17

/*
 * Fills one element of every per-triangle property array with uniform random
 * values drawn from the calling thread's cuRAND generator state.
 *
 * Indexing: flat 1-D index idx = threadIdx.x + blockIdx.x * blockDim.x.
 * Thread idx reads state[idx], writes element idx of each float array and the
 * first three bytes of row idx of `color`.
 *
 * Parameters:
 *   posx, posy, rayon, veloc, opacity, angle - output arrays, one float per thread
 *   color         - base address of a pitched byte buffer, one row per thread
 *   height, width - scale factors applied to posy and posx respectively
 *   state         - generator states, one per thread
 *   pitch         - row stride of `color` in bytes
 *
 * Preconditions: the launch must not create more threads than the arrays have
 * elements (there is no size parameter, hence no bounds guard). No shared
 * memory is used.
 */
__global__ void initializeArrays(float* posx, float* posy, float* rayon, float* veloc, float* opacity,
                                 float* angle, unsigned char* color, int height, int width,
                                 curandState* state, size_t pitch)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Draw from a local copy of the generator state. No barrier is required:
    // each thread only touches its own elements.
    curandState localState = state[idx];

    posx[idx]  = (float)(curand_uniform(&localState) * width);
    posy[idx]  = (float)(curand_uniform(&localState) * height);
    rayon[idx] = (float)(10 + curand_uniform(&localState) * 50);
    angle[idx] = (float)(curand_uniform(&localState) * 360);
    veloc[idx] = (float)(curand_uniform(&localState) * 20 - 10);

    // Rows of the pitched buffer are `pitch` bytes apart; since `color` is a
    // byte pointer, the byte stride can be used directly as an element offset.
    unsigned char* rgb = color + idx * pitch;
    rgb[0] = (unsigned char)(curand_uniform(&localState) * 255);
    rgb[1] = (unsigned char)(curand_uniform(&localState) * 255);
    rgb[2] = (unsigned char)(curand_uniform(&localState) * 255);

    // Float literals keep the arithmetic in single precision; bare 0.3 / 1.5
    // would promote the whole expression to double.
    opacity[idx] = 0.3f + 1.5f * curand_uniform(&localState);

    // Store the advanced state so that a later launch continues the sequence
    // instead of reproducing the same nine values.
    state[idx] = localState;
}

和

// Execution configuration is <<<grid, block>>>: one block (dimGrid) of 100
// threads (dimBlock). `pitch` is the row stride of d_color in bytes.
initializeArrays<<<dimGrid, dimBlock>>>(d_posx, d_posy, d_rayon, d_veloc, d_opacity, d_angle,d_color, img0.height(), img0.width(), devState, pitch);

和

1	unsigned char* d_color;

通过这些更改，我消除了所发现的错误，代码也输出了各不相同的随机值。我尚未检查所有的值，但这应该足以帮助您起步。