CUDA program does not have the same float computation accuracy as the equivalent CPU program

I tried to use the GPU to accelerate my program, which computes the L2 distance between two float arrays. To check the computation accuracy, I wrote both a CUDA version and a CPU version. However, I found that the total error is more than 200, which I don't understand. I use the float type in both cases, so I believe I should get the same results. My code is below.
```cuda
#include <cuda_runtime.h>
#include <stdio.h>
#include <sys/time.h>
#include <math.h>
// #include <helper_functions.h>
#define VECTORDIM 3

double cpuSecond()
{
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return ((double) tp.tv_sec + (double) tp.tv_usec * 1e-6);
}

void DistanceCPU(float* array1, float* array2, int narray1, int narray2, float* output)
{
    float temp;
    for (int i = 0; i < narray1; i++)
    {
        for (int j = 0; j < narray2; j++)
        {
            temp = 0;
            for (int l = 0; l < VECTORDIM; l++)
            {
                temp += powf(array1[i + l * narray1] - array2[j + l * narray2], 2);
            }
            output[i * narray2 + j] = temp;
        }
    }
}

__global__ void DistGPU(float* array1, float* array2, int narray1, int narray2, float* output)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    float temp;

    if (i < narray1)
    {
        for (int j = 0; j < narray2; j++)
        {
            temp = 0;
            temp += powf(array1[i] - array2[j], 2);
            temp += powf(array1[i + narray1] - array2[j + narray2], 2);
            temp += powf(array1[i + 2 * narray1] - array2[j + 2 * narray2], 2);
            output[i * narray2 + j] = temp;
        }
    }
}

int main()
{
    int narray1 = 7000;
    int narray2 = 60000;

    float* array1 = new float[narray1 * VECTORDIM];
    float* array2 = new float[narray2 * VECTORDIM];
    float* outputGPU = new float[narray1 * narray2];
    float* outputCPU = new float[narray1 * narray2];
    float* outputCPUTest = new float[narray1 * narray2];

    float* d_array1;
    float* d_array2;
    float* d_output;

    for (int i = 0; i < narray1 * VECTORDIM; i++)
    {
        array1[i] = static_cast<float> (rand() / (static_cast<float> (RAND_MAX / 10)));
        // std::cout << "Element " << i << " " << array1[i] << std::endl;
    }

    for (int i = 0; i < narray2 * VECTORDIM; i++)
    {
        array2[i] = static_cast<float> (rand() / (static_cast<float> (RAND_MAX / 10)));
    }

    cudaError_t err;

    err = cudaMalloc((void**)&d_array1, narray1 * VECTORDIM * sizeof(float));
    err = cudaMalloc((void**)&d_array2, narray2 * VECTORDIM * sizeof(float));
    err = cudaMalloc((void**)&d_output, narray1 * narray2 * sizeof(float));

    err = cudaMemcpy(d_array1, array1, narray1 * VECTORDIM * sizeof(float), cudaMemcpyHostToDevice);
    err = cudaMemcpy(d_array2, array2, narray2 * VECTORDIM * sizeof(float), cudaMemcpyHostToDevice);

    int threadsPerBlock = 512;
    int blocksPerGrid = (narray1 + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);

    double iStart = cpuSecond();
    DistGPU<<<blocksPerGrid, threadsPerBlock>>>(d_array1, d_array2, narray1, narray2, d_output);
    double iElaps = cpuSecond() - iStart;

    err = cudaMemcpy(outputGPU, d_output, narray1 * narray2 * sizeof(float), cudaMemcpyDeviceToHost);

    printf("Total computation time is %lf \n", iElaps);

    DistanceCPU(array1, array2, narray1, narray2, outputCPU);

    float error = 0;
    for (long i = 0; i < narray1 * narray2; i++)
    {
        error += abs(outputCPU[i] - outputGPU[i]);
    }
    error /= (narray2 * narray1);

    for (int i = 0; i < 20; i++)
    {
        printf("CPU result %f \n", outputCPU[i]);
        printf("GPU result %f \n", outputGPU[i]);
    }

    printf("Error is %f \n", error);
    delete [] array1;
    delete [] array2;
    delete [] outputCPU;
    delete [] outputGPU;
    return 0;
}
```
I printed some of the computed results from both the CPU and the GPU, and I get the following output.
```
CPU result 84.315201
GPU result 84.315193
CPU result 48.804039
GPU result 48.804039
CPU result 26.388403
GPU result 26.388403
CPU result 150.009735
GPU result 150.009750
```
I think float precision should be sufficient, and I don't understand what the real problem is.
I would say the main contributor here is the use of the `powf` function. A given math function implementation on the GPU is not guaranteed to deliver the same accuracy as the corresponding function in the host's math library, so small differences in `powf` results are to be expected.

However, if you're interested in performance, I don't think there is much point in using `pow` or `powf` merely to square a number, on either the CPU or the GPU.

If we replace the `powf` calls with ordinary squaring, the GPU results become much closer to the CPU results, as demonstrated below.
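In isolation, the substitution being tested is just this (a minimal sketch; `sqdiff_powf`/`sqdiff_mul` are illustrative helper names, not part of the original code):

```cuda
#include <math.h>

// The form used in the original code: squaring via the powf library call.
__host__ __device__ inline float sqdiff_powf(float a, float b)
{
    return powf(a - b, 2);
}

// The replacement: compute the difference once and multiply it by itself.
__host__ __device__ inline float sqdiff_mul(float a, float b)
{
    float d = a - b;
    return d * d;   // one subtract and one multiply; no math-library call
}
```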
Here are the results of running the code as-is on CUDA 10.0, Tesla P100, CentOS 7, gcc 4.8.5:
```
$ ./t415
CUDA kernel launch with 14 blocks of 512 threads
Total computation time is 0.000038
CPU result 28.795628
GPU result 28.795628
CPU result 50.995567
GPU result 50.995567
CPU result 46.970348
GPU result 46.970345
CPU result 29.031254
GPU result 29.031254
CPU result 111.297745
GPU result 111.297745
CPU result 19.145151
GPU result 19.145151
CPU result 20.508183
GPU result 20.508183
CPU result 133.916077
GPU result 133.916077
CPU result 84.315201
GPU result 84.315193
CPU result 48.804039
GPU result 48.804039
CPU result 26.388403
GPU result 26.388403
CPU result 150.009735
GPU result 150.009750
CPU result 108.421936
GPU result 108.421936
CPU result 73.092339
GPU result 73.092339
CPU result 79.486023
GPU result 79.486023
CPU result 89.990150
GPU result 89.990150
CPU result 20.142567
GPU result 20.142567
CPU result 43.482445
GPU result 43.482445
CPU result 29.460800
GPU result 29.460800
CPU result 86.545860
GPU result 86.545860
Error is 0.000001
```
Here is the modified code, replacing `powf` with ordinary squaring:
```
$ cat t415.cu
#include <cuda_runtime.h>
#include <stdio.h>
#include <sys/time.h>
#include <math.h>
// #include <helper_functions.h>
#define VECTORDIM 3
typedef float mt;

double cpuSecond()
{
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return ((double) tp.tv_sec + (double) tp.tv_usec * 1e-6);
}

void DistanceCPU(mt* array1, mt* array2, int narray1, int narray2, mt* output)
{
    mt temp;
    for (int i = 0; i < narray1; i++)
    {
        for (int j = 0; j < narray2; j++)
        {
            temp = 0;
            for (int l = 0; l < VECTORDIM; l++)
            {
#ifndef USE_POW
                temp += (array1[i + l * narray1] - array2[j + l * narray2]) * (array1[i + l * narray1] - array2[j + l * narray2]);
#else
                temp += powf(array1[i + l * narray1] - array2[j + l * narray2], 2);
#endif
            }
            output[i * narray2 + j] = temp;
        }
    }
}

__global__ void DistGPU(mt* array1, mt* array2, int narray1, int narray2, mt* output)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    mt temp;

    if (i < narray1)
    {
        for (int j = 0; j < narray2; j++)
        {
            temp = 0;
#ifndef USE_POW
            temp += (array1[i] - array2[j]) * (array1[i] - array2[j]);
            temp += (array1[i + narray1] - array2[j + narray2]) * (array1[i + narray1] - array2[j + narray2]);
            temp += (array1[i + 2 * narray1] - array2[j + 2 * narray2]) * (array1[i + 2 * narray1] - array2[j + 2 * narray2]);
#else
            temp += powf(array1[i] - array2[j], 2);
            temp += powf(array1[i + narray1] - array2[j + narray2], 2);
            temp += powf(array1[i + 2 * narray1] - array2[j + 2 * narray2], 2);
#endif
            output[i * narray2 + j] = temp;
        }
    }
}

int main()
{
    int narray1 = 7000;
    int narray2 = 60000;

    mt* array1 = new mt[narray1 * VECTORDIM];
    mt* array2 = new mt[narray2 * VECTORDIM];
    mt* outputGPU = new mt[narray1 * narray2];
    mt* outputCPU = new mt[narray1 * narray2];
    mt* outputCPUTest = new mt[narray1 * narray2];

    mt* d_array1;
    mt* d_array2;
    mt* d_output;

    for (int i = 0; i < narray1 * VECTORDIM; i++)
    {
        array1[i] = static_cast<mt> (rand() / (static_cast<mt> (RAND_MAX / 10)));
        // std::cout << "Element " << i << " " << array1[i] << std::endl;
    }

    for (int i = 0; i < narray2 * VECTORDIM; i++)
    {
        array2[i] = static_cast<mt> (rand() / (static_cast<mt> (RAND_MAX / 10)));
    }

    cudaError_t err;

    err = cudaMalloc((void**)&d_array1, narray1 * VECTORDIM * sizeof(mt));
    err = cudaMalloc((void**)&d_array2, narray2 * VECTORDIM * sizeof(mt));
    err = cudaMalloc((void**)&d_output, narray1 * narray2 * sizeof(mt));

    err = cudaMemcpy(d_array1, array1, narray1 * VECTORDIM * sizeof(mt), cudaMemcpyHostToDevice);
    err = cudaMemcpy(d_array2, array2, narray2 * VECTORDIM * sizeof(mt), cudaMemcpyHostToDevice);

    int threadsPerBlock = 512;
    int blocksPerGrid = (narray1 + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);

    double iStart = cpuSecond();
    DistGPU<<<blocksPerGrid, threadsPerBlock>>>(d_array1, d_array2, narray1, narray2, d_output);
    double iElaps = cpuSecond() - iStart;

    err = cudaMemcpy(outputGPU, d_output, narray1 * narray2 * sizeof(mt), cudaMemcpyDeviceToHost);

    printf("Total computation time is %lf \n", iElaps);

    DistanceCPU(array1, array2, narray1, narray2, outputCPU);

    mt error = 0;
    for (long i = 0; i < narray1 * narray2; i++)
    {
        error += abs(outputCPU[i] - outputGPU[i]);
    }
    error /= (narray2 * narray1);

    for (int i = 0; i < 20; i++)
    {
        printf("CPU result %f \n", outputCPU[i]);
        printf("GPU result %f \n", outputGPU[i]);
    }

    printf("Error is %f \n", error);
    delete [] array1;
    delete [] array2;
    delete [] outputCPU;
    delete [] outputGPU;
    return 0;
}
$ nvcc -o t415 t415.cu
t415.cu(87): warning: variable "err" was set but never used

$ ./t415
CUDA kernel launch with 14 blocks of 512 threads
Total computation time is 0.000042
CPU result 28.795628
GPU result 28.795628
CPU result 50.995567
GPU result 50.995567
CPU result 46.970348
GPU result 46.970348
CPU result 29.031254
GPU result 29.031254
CPU result 111.297745
GPU result 111.297745
CPU result 19.145151
GPU result 19.145149
CPU result 20.508183
GPU result 20.508183
CPU result 133.916077
GPU result 133.916077
CPU result 84.315201
GPU result 84.315201
CPU result 48.804039
GPU result 48.804039
CPU result 26.388403
GPU result 26.388403
CPU result 150.009735
GPU result 150.009735
CPU result 108.421936
GPU result 108.421936
CPU result 73.092339
GPU result 73.092331
CPU result 79.486023
GPU result 79.486023
CPU result 89.990150
GPU result 89.990150
CPU result 20.142567
GPU result 20.142567
CPU result 43.482445
GPU result 43.482445
CPU result 29.460800
GPU result 29.460800
CPU result 86.545860
GPU result 86.545860
Error is 0.000000
```
Some notes:
- There are still a few differences that I have not studied. The GPU may perform FMA contraction differently than the CPU code does. The next step in the analysis would be to compare `float` against `double` computation, to establish a numerical baseline for which results are closer to the correct answer. In some cases the GPU produces a number closer to the correct result than the corresponding CPU code does, so simply assuming the CPU code is correct and then demanding an explanation for why the GPU code differs is not always the right approach. Here is an example of that kind of mistake. (A sketch of such a `double`-baseline comparison appears after the final output at the end of this answer.)
- If you consider the ordinary-squaring version, it is not obvious to me that this code needs to have any difference in floating-point operation order between the CPU and GPU versions, so I don't think floating-point (lack of) associativity is the primary consideration here. However, I don't have a conclusive explanation for the remaining differences; more work would be needed (see the previous item).
- At least on the GPU, ordinary squaring is likely to be faster than `powf( ,2)`.
- Your timing measurement of the GPU code captures only the kernel launch overhead, because kernel launches are asynchronous. To capture the full kernel execution duration, add a `cudaDeviceSynchronize();` call to the timing region, immediately after the kernel call; see the sketch after this list.
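For that last item, a minimal sketch of the corrected timing region (only the `cudaDeviceSynchronize` call is new; the surrounding lines are from the code above):

```cuda
double iStart = cpuSecond();
DistGPU<<<blocksPerGrid, threadsPerBlock>>>(d_array1, d_array2, narray1, narray2, d_output);
cudaDeviceSynchronize();               // block until the kernel has actually finished
double iElaps = cpuSecond() - iStart;  // now measures kernel execution, not just launch overhead
```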
EDIT: Thanks to @njuffa, who reminded me that it is easy to test the FMA-contraction hypothesis. If we recompile the previous (ordinary-squaring) code with `-fmad=false`, the remaining differences between the CPU and GPU results disappear, which confirms that FMA contraction accounts for them. Note that a contracted FMA rounds only once, so where the results differed, the GPU value may well have been the more accurate one. Here is the result:
```
$ nvcc -o t415 t415.cu -fmad=false
t415.cu(87): warning: variable "err" was set but never used

$ ./t415
CUDA kernel launch with 14 blocks of 512 threads
Total computation time is 0.000039
CPU result 28.795628
GPU result 28.795628
CPU result 50.995567
GPU result 50.995567
CPU result 46.970348
GPU result 46.970348
CPU result 29.031254
GPU result 29.031254
CPU result 111.297745
GPU result 111.297745
CPU result 19.145151
GPU result 19.145151
CPU result 20.508183
GPU result 20.508183
CPU result 133.916077
GPU result 133.916077
CPU result 84.315201
GPU result 84.315201
CPU result 48.804039
GPU result 48.804039
CPU result 26.388403
GPU result 26.388403
CPU result 150.009735
GPU result 150.009735
CPU result 108.421936
GPU result 108.421936
CPU result 73.092339
GPU result 73.092339
CPU result 79.486023
GPU result 79.486023
CPU result 89.990150
GPU result 89.990150
CPU result 20.142567
GPU result 20.142567
CPU result 43.482445
GPU result 43.482445
CPU result 29.460800
GPU result 29.460800
CPU result 86.545860
GPU result 86.545860
Error is 0.000000
```
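As a follow-up to the first note above, here is a minimal sketch of how a `double` baseline could be established. The helper `DistanceCPUDouble` and the comparison loop are my additions for illustration, not part of the original program; they reuse the layout and `VECTORDIM` from the listings above:

```cuda
// Reference computation in double precision; same indexing as DistanceCPU.
void DistanceCPUDouble(float* array1, float* array2, int narray1, int narray2, double* output)
{
    for (int i = 0; i < narray1; i++)
        for (int j = 0; j < narray2; j++)
        {
            double temp = 0;
            for (int l = 0; l < VECTORDIM; l++)
            {
                double d = (double)array1[i + l * narray1] - (double)array2[j + l * narray2];
                temp += d * d;
            }
            output[i * narray2 + j] = temp;
        }
}

// In main(), once outputCPU and outputGPU are filled in:
//
//   double* outputRef = new double[narray1 * narray2];
//   DistanceCPUDouble(array1, array2, narray1, narray2, outputRef);
//   double errCPU = 0, errGPU = 0;
//   for (long i = 0; i < (long)narray1 * narray2; i++)
//   {
//       errCPU += fabs(outputCPU[i] - outputRef[i]);
//       errGPU += fabs(outputGPU[i] - outputRef[i]);
//   }
//
// Whichever total is smaller tells you which float implementation
// lands closer to the double reference.
```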