Asynchronous texture object allocation in multi-GPU code

I have some code that does texture object allocation and host-to-device copying. It is just a modification of an existing answer. I do not explicitly use streams, just cudaSetDevice().

This code works fine; however, when I run the Visual Profiler I can see that the memory copies from host to array are not asynchronous. Each is assigned to its own device stream, but the second one does not start until the first has finished (running on 2 GPUs). I have tried with big images, so I am sure it is not a matter of CPU overhead.

My guess is that something in the code requires synchronization and stalls the CPU, but I cannot figure out what. What can I do to make this loop asynchronous?

MCVE:

void CreateTexture(int num_devices, float* imagedata, int nVoxelX, int nVoxelY, int nVoxelZ, cudaArray** d_cuArrTex, cudaTextureObject_t *texImage);

int main(void)
{
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);

    int nVoxelX = 512;
    int nVoxelY = 512;
    int nVoxelZ = 512;
    float* image = (float*)malloc(nVoxelX*nVoxelY*nVoxelZ*sizeof(float));

    cudaTextureObject_t *texImg = new cudaTextureObject_t[deviceCount];
    cudaArray **d_cuArrTex = new cudaArray*[deviceCount];

    CreateTexture(deviceCount, image, nVoxelX, nVoxelY, nVoxelZ, d_cuArrTex, texImg);
}

The actual function:

void CreateTexture(int num_devices, float* imagedata, int nVoxelX, int nVoxelY, int nVoxelZ ,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage)
{
    //size_t size_image=nVoxelX*nVoxelY*nVoxelZ;
    for (unsigned int i = 0; i < num_devices; i++){
        cudaSetDevice(i);

        //cudaArray Descriptor
        const cudaExtent extent = make_cudaExtent(nVoxelX, nVoxelY, nVoxelZ);
        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
        //cuda Array
        cudaMalloc3DArray(&d_cuArrTex[i], &channelDesc, extent);
        //cudaCheckErrors("Texture memory allocation fail");
        cudaMemcpy3DParms copyParams = {0};


        //Array creation
        copyParams.srcPtr   = make_cudaPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height);
        copyParams.dstArray = d_cuArrTex[i];
        copyParams.extent   = extent;
        copyParams.kind     = cudaMemcpyHostToDevice;
        cudaMemcpy3DAsync(&copyParams);
        //cudaCheckErrors("Texture memory data copy fail");


        //Array creation End
        cudaResourceDesc    texRes;
        memset(&texRes, 0, sizeof(cudaResourceDesc));
        texRes.resType = cudaResourceTypeArray;
        texRes.res.array.array  = d_cuArrTex[i];
        cudaTextureDesc     texDescr;
        memset(&texDescr, 0, sizeof(cudaTextureDesc));
        texDescr.normalizedCoords = false;
        texDescr.filterMode = cudaFilterModePoint;
        texDescr.addressMode[0] = cudaAddressModeBorder;
        texDescr.addressMode[1] = cudaAddressModeBorder;
        texDescr.addressMode[2] = cudaAddressModeBorder;
        texDescr.readMode = cudaReadModeElementType;
        cudaCreateTextureObject(&texImage[i], &texRes, &texDescr, NULL);
        //cudaCheckErrors("Texture object creation fail");
    }
}
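
The cudaCheckErrors(...) calls above are commented out and the macro itself is not shown in the question. A minimal sketch of a typical error-checking macro of that kind (the body here is an assumption, not the original definition) is:

// Hypothetical error-checking macro matching the commented-out calls above;
// it checks the last CUDA runtime error and aborts with a message on failure.
#include <cstdio>
#include <cstdlib>

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
            exit(1); \
        } \
    } while (0)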


The two main issues I can see with the code are:

  • Your host allocation is a pageable allocation. Asynchrony of copy operations in CUDA where one of the targets is host memory requires a pinned allocation for the host memory.

  • You have other synchronizing operations in your texture-creation loop. In my experience, device allocation operations (cudaMalloc3DArray in this case) are synchronizing. I have not run tests to determine whether cudaCreateTextureObject is synchronizing, but I would not be surprised if it is. Therefore, my general recommendation for asynchrony is to get synchronizing operations out of the loop.

  • In your case, we can refactor your code as follows, which seems to allow the operations to overlap from the perspective of nvprof. A sketch that uses explicit per-device streams is also shown after the profiler output below.

    $ cat t399.cu
    void CreateTexture(int num_devices, float* imagedata, int nVoxelX, int nVoxelY, int nVoxelZ ,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage)
    {
        //size_t size_image=nVoxelX*nVoxelY*nVoxelZ;

        const cudaExtent extent = make_cudaExtent(nVoxelX, nVoxelY, nVoxelZ);
        for (unsigned int i = 0; i < num_devices; i++){
            cudaSetDevice(i);

            //cudaArray Descriptor
            cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
            //cuda Array
            cudaMalloc3DArray(&d_cuArrTex[i], &channelDesc, extent);
            //cudaCheckErrors("Texture memory allocation fail");
            }
        for (unsigned int i = 0; i < num_devices; i++){
            cudaSetDevice(i);
            cudaMemcpy3DParms copyParams = {0};
            //Array creation
            copyParams.srcPtr   = make_cudaPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height);
            copyParams.dstArray = d_cuArrTex[i];
            copyParams.extent   = extent;
            copyParams.kind     = cudaMemcpyHostToDevice;
            cudaMemcpy3DAsync(&copyParams);
            //cudaCheckErrors("Texture memory data copy fail");
            }
        for (unsigned int i = 0; i < num_devices; i++){
            cudaSetDevice(i);
            //Array creation End
            cudaResourceDesc    texRes;
            memset(&texRes, 0, sizeof(cudaResourceDesc));
            texRes.resType = cudaResourceTypeArray;
            texRes.res.array.array  = d_cuArrTex[i];
            cudaTextureDesc     texDescr;
            memset(&texDescr, 0, sizeof(cudaTextureDesc));
            texDescr.normalizedCoords = false;
            texDescr.filterMode = cudaFilterModePoint;
            texDescr.addressMode[0] = cudaAddressModeBorder;
            texDescr.addressMode[1] = cudaAddressModeBorder;
            texDescr.addressMode[2] = cudaAddressModeBorder;
            texDescr.readMode = cudaReadModeElementType;
            cudaCreateTextureObject(&texImage[i], &texRes, &texDescr, NULL);
            //cudaCheckErrors("Texture object creation fail");
        }
        for (unsigned int i = 0; i < num_devices; i++){
            cudaSetDevice(i);
            cudaDeviceSynchronize();
        }
    }

    int main(void)
    {
      int deviceCount =0 ;
      cudaGetDeviceCount(&deviceCount);

      int nVoxelX=512;
      int nVoxelY=512;
      int nVoxelZ=512;
      float* image;

      cudaHostAlloc(&image, nVoxelX*nVoxelY*nVoxelZ*sizeof(float), cudaHostAllocDefault);

      cudaTextureObject_t *texImg =new cudaTextureObject_t[deviceCount];
      cudaArray **d_cuArrTex = new cudaArray*[deviceCount];

      CreateTexture(deviceCount,image, nVoxelX,nVoxelY, nVoxelZ,d_cuArrTex,texImg);
    }


    $ nvcc -o t399 t399.cu
    $ cuda-memcheck ./t399
    ========= CUDA-MEMCHECK
    ========= ERROR SUMMARY: 0 errors
    $ nvprof --print-gpu-trace ./t399
    ==19953== NVPROF is profiling process 19953, command: ./t399
    ==19953== Profiling application: ./t399
    ==19953== Profiling result:
       Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
    1.55311s  90.735ms                    -               -         -         -         -  512.00MB  5.5106GB/s      Pinned       Array  Tesla P100-PCIE         1         7  [CUDA memcpy HtoA]
    1.55316s  90.640ms                    -               -         -         -         -  512.00MB  5.5163GB/s      Pinned       Array   Tesla K40m (1)         2        18  [CUDA memcpy HtoA]
    1.55318s  85.962ms                    -               -         -         -         -  512.00MB  5.8165GB/s      Pinned       Array  Tesla K20Xm (2)         3        29  [CUDA memcpy HtoA]
    1.55320s  89.908ms                    -               -         -         -         -  512.00MB  5.5612GB/s      Pinned       Array  Tesla K20Xm (3)         4        40  [CUDA memcpy HtoA]

    Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
    SSMem: Static shared memory allocated per CUDA block.
    DSMem: Dynamic shared memory allocated per CUDA block.
    SrcMemType: The type of source memory accessed by memory operation/copy
    DstMemType: The type of destination memory accessed by memory operation/copy
    $

    My system happens to be a 4-GPU system, with two GPUs hanging off each of two root ports. Therefore, from the profiler's perspective, the host-to-device pinned transfer bandwidth over PCIe Gen3 is being split between the two GPUs on each port, but careful study of the profiler start and duration times shows that all four copies overlap.
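
    If you prefer explicit streams over each device's default stream (the question notes that no streams are used explicitly), the same structure can be kept. The following is a minimal sketch under that assumption: the function name CreateTextureStreams and the extra cudaStream_t* parameter are illustrative additions, not part of the refactor above. Each cudaMemcpy3DAsync is issued into its own device's non-default stream, and the final loop waits on those streams instead of calling cudaDeviceSynchronize. The host buffer is still assumed to be pinned (cudaHostAlloc), since that remains a requirement for the copies to be asynchronous.

    void CreateTextureStreams(int num_devices, float* imagedata, int nVoxelX, int nVoxelY, int nVoxelZ, cudaArray** d_cuArrTex, cudaTextureObject_t *texImage, cudaStream_t *stream)
    {
        const cudaExtent extent = make_cudaExtent(nVoxelX, nVoxelY, nVoxelZ);
        // Synchronizing work first: per-device stream creation and cudaArray allocation
        for (unsigned int i = 0; i < num_devices; i++){
            cudaSetDevice(i);
            cudaStreamCreate(&stream[i]);
            cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
            cudaMalloc3DArray(&d_cuArrTex[i], &channelDesc, extent);
        }
        // Asynchronous host->array copies, each issued into its own device's stream
        for (unsigned int i = 0; i < num_devices; i++){
            cudaSetDevice(i);
            cudaMemcpy3DParms copyParams = {0};
            copyParams.srcPtr   = make_cudaPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height);
            copyParams.dstArray = d_cuArrTex[i];
            copyParams.extent   = extent;
            copyParams.kind     = cudaMemcpyHostToDevice;
            cudaMemcpy3DAsync(&copyParams, stream[i]);
        }
        // Texture object creation, unchanged from the version above
        for (unsigned int i = 0; i < num_devices; i++){
            cudaSetDevice(i);
            cudaResourceDesc texRes;
            memset(&texRes, 0, sizeof(cudaResourceDesc));
            texRes.resType = cudaResourceTypeArray;
            texRes.res.array.array = d_cuArrTex[i];
            cudaTextureDesc texDescr;
            memset(&texDescr, 0, sizeof(cudaTextureDesc));
            texDescr.normalizedCoords = false;
            texDescr.filterMode = cudaFilterModePoint;
            texDescr.addressMode[0] = cudaAddressModeBorder;
            texDescr.addressMode[1] = cudaAddressModeBorder;
            texDescr.addressMode[2] = cudaAddressModeBorder;
            texDescr.readMode = cudaReadModeElementType;
            cudaCreateTextureObject(&texImage[i], &texRes, &texDescr, NULL);
        }
        // Wait on each device's stream rather than on the whole device
        for (unsigned int i = 0; i < num_devices; i++){
            cudaSetDevice(i);
            cudaStreamSynchronize(stream[i]);
        }
    }

    In this sketch the caller owns the stream array (for example, cudaStream_t streams[4]) and should eventually release the streams with cudaStreamDestroy.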