Asynchronous texture object allocation in multi-GPU code
I have some code for texture object allocation and host-to-device copying. It is just a modification of an answer. I do not explicitly use streams, only cudaSetDevice() for each device.

This code works fine; however, when I run the Visual Profiler I can see that the memory copies from the host to the arrays are not asynchronous. Each copy is assigned to its own device stream, but the second one does not start until the first has finished (running on 2 GPUs). I have tried with large images, so I am sure it is not a matter of CPU overhead.

My guess is that something in the code requires a synchronization that stalls the CPU, but I do not know what. What can I do to make this loop asynchronous?
MCVE:
```
void CreateTexture(int num_devices, float* imagedata, int nVoxelX, int nVoxelY, int nVoxelZ, cudaArray** d_cuArrTex, cudaTextureObject_t *texImage);

int main(void)
{
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);

    int nVoxelX = 512;
    int nVoxelY = 512;
    int nVoxelZ = 512;
    float* image = (float*)malloc(nVoxelX*nVoxelY*nVoxelZ*sizeof(float));

    cudaTextureObject_t *texImg = new cudaTextureObject_t[deviceCount];
    cudaArray **d_cuArrTex = new cudaArray*[deviceCount];

    CreateTexture(deviceCount, image, nVoxelX, nVoxelY, nVoxelZ, d_cuArrTex, texImg);
}
```
The actual function:
```
void CreateTexture(int num_devices, float* imagedata, int nVoxelX, int nVoxelY, int nVoxelZ, cudaArray** d_cuArrTex, cudaTextureObject_t *texImage)
{
    //size_t size_image=nVoxelX*nVoxelY*nVoxelZ;
    for (unsigned int i = 0; i < num_devices; i++){
        cudaSetDevice(i);

        //cudaArray Descriptor
        const cudaExtent extent = make_cudaExtent(nVoxelX, nVoxelY, nVoxelZ);
        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
        //cuda Array
        cudaMalloc3DArray(&d_cuArrTex[i], &channelDesc, extent);
        //cudaCheckErrors("Texture memory allocation fail");

        cudaMemcpy3DParms copyParams = {0};
        //Array creation
        copyParams.srcPtr = make_cudaPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height);
        copyParams.dstArray = d_cuArrTex[i];
        copyParams.extent = extent;
        copyParams.kind = cudaMemcpyHostToDevice;
        cudaMemcpy3DAsync(&copyParams);
        //cudaCheckErrors("Texture memory data copy fail");
        //Array creation End

        cudaResourceDesc texRes;
        memset(&texRes, 0, sizeof(cudaResourceDesc));
        texRes.resType = cudaResourceTypeArray;
        texRes.res.array.array = d_cuArrTex[i];
        cudaTextureDesc texDescr;
        memset(&texDescr, 0, sizeof(cudaTextureDesc));
        texDescr.normalizedCoords = false;
        texDescr.filterMode = cudaFilterModePoint;
        texDescr.addressMode[0] = cudaAddressModeBorder;
        texDescr.addressMode[1] = cudaAddressModeBorder;
        texDescr.addressMode[2] = cudaAddressModeBorder;
        texDescr.readMode = cudaReadModeElementType;
        cudaCreateTextureObject(&texImage[i], &texRes, &texDescr, NULL);
        //cudaCheckErrors("Texture object creation fail");
    }
}
```
The two main issues I can see with the code are:
1. Your host allocation is a pageable allocation. For a copy operation in CUDA to be asynchronous when one of the endpoints is host memory, that host memory must be a pinned (page-locked) allocation (see the short sketch after these two points).
2. You have other synchronizing operations in the texture-creation loop. In my experience, device allocation operations (cudaMalloc3DArray in this case) are synchronizing, and texture object creation may be as well, so the general recommendation is to move synchronizing operations out of the loop whose iterations you want to overlap.
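A minimal sketch contrasting the two kinds of host allocation (this snippet is for illustration only and is not part of the original code; the buffer size simply mirrors the 512^3 float volume used above):

```
#include <cuda_runtime.h>
#include <cstdlib>

int main(void)
{
    size_t bytes = 512ull * 512 * 512 * sizeof(float);

    // Pageable host memory: async copies from/to this pointer
    // fall back to synchronous behavior.
    float *pageable = (float*)malloc(bytes);

    // Pinned (page-locked) host memory: required for host<->device
    // copies to actually proceed asynchronously.
    float *pinned = nullptr;
    cudaHostAlloc(&pinned, bytes, cudaHostAllocDefault);

    // ... use the buffers ...

    cudaFreeHost(pinned);
    free(pageable);
    return 0;
}
```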
In your case, we can refactor the code as follows, which, from the profiler's perspective, appears to allow the operations to overlap:
```
$ cat t399.cu
void CreateTexture(int num_devices, float* imagedata, int nVoxelX, int nVoxelY, int nVoxelZ, cudaArray** d_cuArrTex, cudaTextureObject_t *texImage)
{
    //size_t size_image=nVoxelX*nVoxelY*nVoxelZ;
    const cudaExtent extent = make_cudaExtent(nVoxelX, nVoxelY, nVoxelZ);
    for (unsigned int i = 0; i < num_devices; i++){
        cudaSetDevice(i);
        //cudaArray Descriptor
        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
        //cuda Array
        cudaMalloc3DArray(&d_cuArrTex[i], &channelDesc, extent);
        //cudaCheckErrors("Texture memory allocation fail");
    }
    for (unsigned int i = 0; i < num_devices; i++){
        cudaSetDevice(i);
        cudaMemcpy3DParms copyParams = {0};
        //Array creation
        copyParams.srcPtr = make_cudaPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height);
        copyParams.dstArray = d_cuArrTex[i];
        copyParams.extent = extent;
        copyParams.kind = cudaMemcpyHostToDevice;
        cudaMemcpy3DAsync(&copyParams);
        //cudaCheckErrors("Texture memory data copy fail");
    }
    for (unsigned int i = 0; i < num_devices; i++){
        cudaSetDevice(i);
        //Array creation End
        cudaResourceDesc texRes;
        memset(&texRes, 0, sizeof(cudaResourceDesc));
        texRes.resType = cudaResourceTypeArray;
        texRes.res.array.array = d_cuArrTex[i];
        cudaTextureDesc texDescr;
        memset(&texDescr, 0, sizeof(cudaTextureDesc));
        texDescr.normalizedCoords = false;
        texDescr.filterMode = cudaFilterModePoint;
        texDescr.addressMode[0] = cudaAddressModeBorder;
        texDescr.addressMode[1] = cudaAddressModeBorder;
        texDescr.addressMode[2] = cudaAddressModeBorder;
        texDescr.readMode = cudaReadModeElementType;
        cudaCreateTextureObject(&texImage[i], &texRes, &texDescr, NULL);
        //cudaCheckErrors("Texture object creation fail");
    }
    for (unsigned int i = 0; i < num_devices; i++){
        cudaSetDevice(i);
        cudaDeviceSynchronize();
    }
}

int main(void)
{
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);

    int nVoxelX = 512;
    int nVoxelY = 512;
    int nVoxelZ = 512;
    float* image;
    cudaHostAlloc(&image, nVoxelX*nVoxelY*nVoxelZ*sizeof(float), cudaHostAllocDefault);

    cudaTextureObject_t *texImg = new cudaTextureObject_t[deviceCount];
    cudaArray **d_cuArrTex = new cudaArray*[deviceCount];

    CreateTexture(deviceCount, image, nVoxelX, nVoxelY, nVoxelZ, d_cuArrTex, texImg);
}
$ nvcc -o t399 t399.cu
$ cuda-memcheck ./t399
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof --print-gpu-trace ./t399
==19953== NVPROF is profiling process 19953, command: ./t399
==19953== Profiling application: ./t399
==19953== Profiling result:
   Start  Duration  Grid Size  Block Size  Regs*  SSMem*  DSMem*      Size  Throughput  SrcMemType  DstMemType           Device  Context  Stream  Name
1.55311s  90.735ms          -           -      -       -       -  512.00MB  5.5106GB/s      Pinned       Array  Tesla P100-PCIE        1       7  [CUDA memcpy HtoA]
1.55316s  90.640ms          -           -      -       -       -  512.00MB  5.5163GB/s      Pinned       Array   Tesla K40m (1)        2      18  [CUDA memcpy HtoA]
1.55318s  85.962ms          -           -      -       -       -  512.00MB  5.8165GB/s      Pinned       Array  Tesla K20Xm (2)        3      29  [CUDA memcpy HtoA]
1.55320s  89.908ms          -           -      -       -       -  512.00MB  5.5612GB/s      Pinned       Array  Tesla K20Xm (3)        4      40  [CUDA memcpy HtoA]

Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
SrcMemType: The type of source memory accessed by memory operation/copy
DstMemType: The type of destination memory accessed by memory operation/copy
$
```
My system here is a 4-GPU system, with two GPUs hanging off each of two root ports. Therefore, from the profiler's perspective, the host-to-device pinned transfer bandwidth available over PCIe Gen3 on each port is split between the two GPUs on that port, but a careful study of the start times and durations in the profiler output shows that all four copies overlap.
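As a side note, the cudaCheckErrors(...) calls in the listings above are commented out and the macro itself is never defined. A typical definition of such an error-checking macro might look like the following (a sketch based on the commented-out calls; the macro actually used originally is not shown):

```
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical definition for the cudaCheckErrors(...) calls that appear
// (commented out) in the code above: report the last CUDA runtime error
// with a caller-supplied message and abort if one occurred.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
            exit(1); \
        } \
    } while (0)
```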