关于C#：NVIDIA Visual profiler不会生成时间轴

NVIDIA Visual profiler does not generate a timeline

我的问题与[在此之前向SO提出的问题] [1]几乎相同。但是尚未提供任何答案，因此，我要提出一个单独的问题。

我在Windows-7操作系统上使用CUDA 7.0工具包。我正在使用VS-2013。

我试图生成向量加法样本程序的时间表，并且它起作用了。但是，当我按照完全相同的步骤生成自己的代码的时间轴时，它将继续显示消息"正在运行应用程序以生成时间轴"。我知道内核被调用并且一切正常。

完成与CUDA相关的所有操作后，也会出现

cudaDeviceReset()调用。

程序：我已经更改了原始问题，以提供一个最小的可行示例，它可以产生相同的问题。不管我放置cudaDeviceReset()的位置如何，以下代码都不会使用nvvp生成时间轴。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66

#include"cuda_runtime.h"
#include"device_launch_parameters.h"

//OpenCV
#include <opencv2/highgui.hpp>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>

#include <stdio.h>

using namespace cv;

__global__ void colorTransformation_kernel(int numChannels, int iw, int ih, unsigned char *ptr_source, unsigned char *ptr_dst)
{
// Calculate our pixel's location
int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;

// Operate only if we are in the correct boundaries
if (x >= 0 && x < iw && y >= 0 && y < ih)
{
ptr_dst[numChannels* (iw*y + x) + 0] = ptr_source[numChannels* (iw*y + x) + 0];
ptr_dst[numChannels* (iw*y + x) + 1] = ptr_source[numChannels* (iw*y + x) + 1];
ptr_dst[numChannels* (iw*y + x) + 2] = ptr_source[numChannels* (iw*y + x) + 2];
}
}

int main()
{
while (1)
{
Mat image(400, 400, CV_8UC3, Scalar(0, 0, 255));
unsigned char *h_src = image.data;
size_t numBytes = image.rows * image.cols * 3;
int numChannels = 3;

unsigned char *dev_src, *dev_dst, *h_dst;

//Allocate memomry at device for SOURCE and DESTINATION and get their pointers
cudaMalloc((void**)&dev_src, numBytes * sizeof(unsigned char));
cudaMalloc((void**)&dev_dst, numBytes * sizeof(unsigned char));

////Copy the source image to the device i.e. GPU
cudaMemcpy(dev_src, h_src, numBytes * sizeof(unsigned char), cudaMemcpyHostToDevice);

////KERNEL
dim3 numOfBlocks(3 * (image.cols / 20), 3 * (image.rows / 20)); //multiplied by 3 because we have 3 channel image now
dim3 numOfThreadsPerBlocks(20, 20);
colorTransformation_kernel << <numOfBlocks, numOfThreadsPerBlocks >> >(numChannels, image.cols, image.rows, dev_src, dev_dst);
cudaDeviceSynchronize();

//Get the processed image
Mat org_dijSDK_img(image.rows, image.cols, CV_8UC3);
h_dst = org_dijSDK_img.data;
cudaMemcpy(h_dst, dev_dst, numBytes * sizeof(unsigned char), cudaMemcpyDeviceToHost);

//DISPLAY PROCESSED IMAGE
imshow("Processed dijSDK image", org_dijSDK_img);
waitKey(33);

}

cudaDeviceReset();
return 0;
}

非常重要的线索：如果我注释while(1)行，因此仅运行一次代码，则nvvp会生成时间轴。但是在我的原始项目中，我无法通过这样做获得时间轴配置文件，因为它包含多线程和其他内容，因此在第一次运行时就没有要处理的图像。因此，我必须需要某种方法来生成包含无限while loop。

的代码的时间轴。

相关讨论

我的代码中的问题是无限的while loop，由于从未调用过cudaDeviceReset()。有两种可能的解决方案来处理这种情况：

如果您只想查看时间轴分析，则只需注释您的while loop，nvvp就能达到main()末尾的cudaDeviceReset()。

在某些情况下，您必须在程序内保持循环。例如，在我的包含多线程的原始项目中，while loop的初始180运行期间没有要处理的映像。要处理这种情况，请用for loop替换while循环，该循环可以运行有限次。例如，以下代码帮助我获得了4次运行的时间线分析。我只发布修改后的main()。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76

int main()
{
cudaStream_t stream_one;
cudaStream_t stream_two;
cudaStream_t stream_three;

//while (1)
for (int i = 0; i < 4; i++)
{
cudaStreamCreate(&stream_one);
cudaStreamCreate(&stream_two);
cudaStreamCreate(&stream_three);

Mat image = imread("DijSDK_test_image.jpg", 1);
//Mat image(1080, 1920, CV_8UC3, Scalar(0,0,255));
size_t numBytes = image.rows * image.cols * 3;
int numChannels = 3;

int iw = image.rows;
int ih = image.cols;
size_t totalMemSize = numBytes * sizeof(unsigned char);
size_t oneThirdMemSize = totalMemSize / 3;

unsigned char *dev_src_1, *dev_src_2, *dev_src_3, *dev_dst_1, *dev_dst_2, *dev_dst_3, *h_src, *h_dst;

//Allocate memomry at device for SOURCE and DESTINATION and get their pointers
cudaMalloc((void**)&dev_src_1, (totalMemSize) / 3);
cudaMalloc((void**)&dev_src_2, (totalMemSize) / 3);
cudaMalloc((void**)&dev_src_3, (totalMemSize) / 3);
cudaMalloc((void**)&dev_dst_1, (totalMemSize) / 3);
cudaMalloc((void**)&dev_dst_2, (totalMemSize) / 3);
cudaMalloc((void**)&dev_dst_3, (totalMemSize) / 3);

//Get the processed image
Mat org_dijSDK_img(image.rows, image.cols, CV_8UC3, Scalar(0, 0, 255));
h_dst = org_dijSDK_img.data;
//copy new data of image to the host pointer
h_src = image.data;

//Copy the source image to the device i.e. GPU
cudaMemcpyAsync(dev_src_1, h_src, (totalMemSize) / 3, cudaMemcpyHostToDevice, stream_one);
cudaMemcpyAsync(dev_src_2, h_src + oneThirdMemSize, (totalMemSize) / 3, cudaMemcpyHostToDevice, stream_two);
cudaMemcpyAsync(dev_src_3, h_src + (2 * oneThirdMemSize), (totalMemSize) / 3, cudaMemcpyHostToDevice, stream_three);

//KERNEL--stream-1
callMultiStreamingCudaKernel(dev_src_1, dev_dst_1, numChannels, iw, ih, &stream_one);
//KERNEL--stream-2
callMultiStreamingCudaKernel(dev_src_2, dev_dst_2, numChannels, iw, ih, &stream_two);
//KERNEL--stream-3
callMultiStreamingCudaKernel(dev_src_3, dev_dst_3, numChannels, iw, ih, &stream_three);

//RESULT copy: GPU to CPU
cudaMemcpyAsync(h_dst, dev_dst_1, (totalMemSize) / 3, cudaMemcpyDeviceToHost, stream_one);
cudaMemcpyAsync(h_dst + oneThirdMemSize, dev_dst_2, (totalMemSize) / 3, cudaMemcpyDeviceToHost, stream_two);
cudaMemcpyAsync(h_dst + (2 * oneThirdMemSize), dev_dst_3, (totalMemSize) / 3, cudaMemcpyDeviceToHost, stream_three);

// wait for results
cudaStreamSynchronize(stream_one);
cudaStreamSynchronize(stream_two);
cudaStreamSynchronize(stream_three);

//Assign the processed data to the display image.
org_dijSDK_img.data = h_dst;
//DISPLAY PROCESSED IMAGE
imshow("Processed dijSDK image", org_dijSDK_img);
waitKey(33);

}

cudaDeviceReset();
return 0;
}