CUDA error with processing the image
我想以彩色图像作为输入,得到黑白(灰度)图像作为输出。我使用 OpenCV 读取图像并写出结果,并在 CUDA 内核中把图像转换为黑白图像。同样的代码在不使用 OpenCV 时工作正常,但现在的输出与我期望的输出略有不同。
我认为CUDA代码需要进行一些修改才能与OpenCV一起使用。我做了一些工作,但没有找到解决方法。也许有人可以给我建议或修改我的代码?我真的很困惑这个问题。
// Grayscale kernel as posted in the question.
// Each thread computes one global index and writes the integer average of the
// .x, .y and .z members of DataIn[idx] to DataOut[idx]. The unsigned char
// operands are promoted to int before the additions, so the sum and the
// division by 3 are carried out in int; the result is narrowed on the store.
// NOTE(review): indexing a uchar4 array advances 4 bytes per element. The host
// code below hands it the buffer returned by cvLoadImage(..., 1), which is
// presumably packed 3-channel data -- confirm; if so, every element after the
// first is read from the wrong offset.
// NOTE(review): there is no bounds guard; every launched thread reads and writes.
__global__ void addMatrix(uchar4 *DataIn, unsigned char *DataOut)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    DataOut[idx] = (DataIn[idx].x + DataIn[idx].y + DataIn[idx].z)/3;
}

// Reads an image path from stdin, loads the image with OpenCV, runs addMatrix
// on the GPU, then shows both images and saves the single-channel result.
// Always returns 0; no CUDA or OpenCV return value is checked anywhere below.
int main()
{
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    char* c ="";   // never used afterwards
    // NOTE(review): the "\ " sequences in the string literals of this function
    // look like "\n" escapes damaged during extraction -- confirm against the
    // original post. They are left untouched here.
    printf("Input source of image\ Example of right directory file: E:\\henrik-evensen-castle-valley-v03.jpg\ Your turn:\ ");
    char *tbLEN;
    tbLEN = new char [1024];   // path buffer; no matching delete[] appears in this function
    cin.getline(tbLEN,1024);
    cout<< endl <<"Your image:" << tbLEN << endl;

    // Input image: loaded from the path typed by the user.
    // NOTE(review): the returned pointer is dereferenced without a null check.
    IplImage* image;
    image = cvLoadImage(tbLEN, 1);
    int height = image->height;
    int width = image->width;
    int step = image->widthStep;
    int SizeIn = (step*height);   // input size in bytes (row stride * rows)
    printf("\ Processing image\ ");

    // Output image: same width/height, 8-bit depth, one channel.
    IplImage *image2 = cvCreateImage(cvSize(width, height), IPL_DEPTH_8U, 1);
    int step2 = image2->widthStep;
    int SizeOut = step2 * height;   // output size in bytes (row stride * rows)

    // Host views of the pixel buffers and the device pointers that mirror them.
    // The input bytes are reinterpreted as uchar4 purely by this cast.
    uchar4* DatIn = (uchar4*)image->imageData;
    unsigned char* DatOut = (unsigned char*)image2->imageData;
    uchar4 *datIndev;
    unsigned char *datOutdev;
    printf("Allocating memory on Device\ ");
    /* Allocate memory on Device (sizes are byte counts) */
    cudaMalloc(&datIndev, SizeIn * sizeof(unsigned char));
    cudaMalloc(&datOutdev, SizeOut * sizeof(unsigned char));
    printf("Copy data on Device\ ");
    /* Copy data on Device */
    cudaMemcpy(datIndev, DatIn, SizeIn * sizeof(unsigned char), cudaMemcpyHostToDevice);
    // The output buffer is uploaded as well, although nothing has written to it yet.
    cudaMemcpy(datOutdev, DatOut, SizeOut * sizeof(unsigned char), cudaMemcpyHostToDevice);

    // 1-D launch: one thread per pixel, using the device's maximum block size.
    // NOTE(review): the integer division truncates, so the last
    // (width * height) % NumThreadsX pixels are not covered by any thread, and
    // an image with fewer pixels than NumThreadsX gives a block count of 0.
    // NOTE(review): the kernel indexes the output as width * height contiguous
    // bytes; if step2 differs from width the rows will not line up -- confirm.
    int NumThreadsX = deviceProp.maxThreadsPerBlock;
    int NumBlocksX = (width * height)/NumThreadsX;
    dim3 blocks(NumBlocksX, 1, 1);
    dim3 threads(NumThreadsX, 1, 1);
    addMatrix <<< blocks, threads >>> (datIndev, datOutdev);
    // Copy the result back into image2's pixel buffer.
    cudaMemcpy(DatOut, datOutdev, SizeOut * sizeof(unsigned char), cudaMemcpyDeviceToHost);

    // Display both images and write the single-channel result to disk.
    cvNamedWindow("Imagecolor");
    cvShowImage("Imagecolor", image);
    cvNamedWindow("Gray");
    cvShowImage("Gray", image2);
    const char* filename1 ="CcPwSwMW4AELPUc.jpg";
    printf("Saving an output image\ ");
    cvSaveImage( filename1, image2 );
    cudaFree(datOutdev);
    cudaFree(datIndev);
    cvWaitKey(0);   // called after the device buffers are freed
    return 0;
}
这里有几个问题:
您关于四通道数据的假设是不正确的。您的代码从文件加载到内存中的是三通道 BGR 图像。因此,您需要将引用从 `uchar4` 改为 `unsigned char`,并让内核中的每个线程从源图像中读取三个字节。
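您的内核本身包含潜在的算术错误。三个 unsigned char 像素值的总和可能会溢出 unsigned char 类型的中间结果,并产生不正确的平均值。您应该使用更大的类型进行计算。(严格来说,C/C++ 的整型提升会先把 unsigned char 操作数提升为 int 再相加,因此原表达式中的求和本身是按 int 计算的;显式使用 int 中间变量可以让这一点更清楚,也避免日后改写时引入溢出。)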
总的来说,您的内核应如下所示:
// Averages packed 3-byte pixels into one gray byte per pixel.
//
// DataIn  : source bytes, three consecutive bytes per pixel, read at offsets
//           3*idx, 3*idx+1 and 3*idx+2 (treated as B, G, R in that order).
// DataOut : destination, one byte per pixel, written at offset idx.
//
// Launch layout: 1-D grid of 1-D blocks, one thread per pixel. There is no
// bounds guard, so every launched thread reads and writes; the launch must not
// cover more threads than there are pixels.
// NOTE(review): the flat 3*idx addressing assumes rows are stored back to back
// with no padding -- confirm against the image's row stride.
__global__ void addMatrix(unsigned char *DataIn, unsigned char *DataOut)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Point at the first byte of this thread's pixel.
    const unsigned char *pixel = DataIn + 3 * idx;

    // Accumulate in int so the three-channel sum is held in a wide type.
    const int blue  = pixel[0];
    const int green = pixel[1];
    const int red   = pixel[2];
    const int total = blue + red + green;

    DataOut[idx] = static_cast<unsigned char>(total / 3);
}
然后您可能会发现图像看起来正确。