DX11 Compute Shader writes only to one index
我真的不知道这是怎么回事。
我有一个计算着色器,它接收FFT结果(来自实数输入),计算每个bin的功率,并将结果存储到另一个缓冲区(UAV)中。FFT实现使用的是D3DCSX库。
有问题的着色器:
// A complex value stored as a real/imaginary pair of floats.
struct Complex
{
    float real;
    float imag;
};

// u0: FFT output, declared here as a structured buffer of Complex.
RWStructuredBuffer<Complex> g_result : register(u0);
// u1: per-bin output, declared here as a structured buffer of float.
RWStructuredBuffer<float> g_powers : register(u1);

// Each thread group contains a single thread; thread id.x handles bin id.x.
[numthreads(1, 1, 1)]
void main(uint3 id : SV_DispatchThreadID)
{
    const uint bin = id.x;

    // The input is read at bin + 1, so element 0 of g_result is never used.
    const Complex value = g_result[bin + 1];
    const float re = value.real;
    const float im = value.imag;

    // Squared magnitude of the complex value.
    const float power = re * re + im * im;

    // Computed but never written anywhere by this version of the shader.
    const float magnitude = sqrt(power);
    const float decibels = 10.0f * log10(1.0f + power);

    g_powers[bin] = power;
}
缓冲区创建代码:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 | //The buffer in which the resulting powers are stored (m_result_buffer1) buffer_desc.BindFlags = D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE; buffer_desc.ByteWidth = sizeof(float) * NumBins(); buffer_desc.CPUAccessFlags = 0; buffer_desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS; buffer_desc.StructureByteStride = sizeof(float); buffer_desc.Usage = D3D11_USAGE_DEFAULT; hr = m_device->CreateBuffer ( &buffer_desc, nullptr, &m_result_buffer1 ); HR_THROW(); //UAV for m_result_buffer1 view_desc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER; view_desc.Buffer.FirstElement = 0; view_desc.Format = DXGI_FORMAT_R32_TYPELESS; view_desc.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_RAW; view_desc.Buffer.NumElements = NumBins(); hr = m_device->CreateUnorderedAccessView ( m_result_buffer1, &view_desc, &m_result_view ); HR_THROW(); //Buffer for reading powers to the CPU buffer_desc.BindFlags = 0; buffer_desc.ByteWidth = sizeof(float) * NumBins(); buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ; buffer_desc.MiscFlags = 0; buffer_desc.StructureByteStride = sizeof(float); buffer_desc.Usage = D3D11_USAGE_STAGING; hr = m_device->CreateBuffer ( &buffer_desc, nullptr, &m_result_buffer2 ); HR_THROW(); |
调度代码:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | CComPtr<ID3D11UnorderedAccessView> result_view; hr = m_fft->ForwardTransform ( m_sample_view, &result_view ); HR_THROW(); ID3D11UnorderedAccessView* views[] = { result_view, //FFT UAV (u0) m_result_view //Power UAV (u1) }; m_context->CSSetShader(m_power_cs, nullptr, 0); m_context->CSSetUnorderedAccessViews(0, 2, views, nullptr); m_context->Dispatch(NumBins(), 1, 1); |
最后是CPU映射代码:
// Copy the GPU-side results into m_result_buffer2 so they can be mapped for reading.
m_context->CopyResource(m_result_buffer2, m_result_buffer1);

// NOTE(review): the HRESULT returned by Map is not checked before pData is used.
D3D11_MAPPED_SUBRESOURCE mapped = {};
m_context->Map(m_result_buffer2, 0, D3D11_MAP_READ, 0, &mapped);

// Copy NumBins() floats out of the mapped memory into the caller's array.
memcpy(result, mapped.pData, sizeof(float) * NumBins());

m_context->Unmap(m_result_buffer2, 0);
现象是:这个着色器似乎让每个线程都写入输出缓冲区中的同一个索引。映射后的缓冲区中,第一个bin总是正确的值,其余每个bin读到的都是0.0f。CPU上的等效代码运行正常。奇怪的是,我在着色器中加入了条件语句进行检查,并且确认每个线程的调度ID是正确的。
我的直觉是,问题的根源在缓冲区创建代码或映射代码中。我知道GPU上运行的线程数量是正确的,调度ID也是正确的,只是CPU端读到的结果是错误的。
问题解决了!
我之前在着色器中使用 RWStructuredBuffer 来访问FFT结果;改用 RWByteAddressBuffer 读取后,着色器如下:
// u0: FFT output, read as a raw byte-address buffer.
RWByteAddressBuffer g_result : register(u0);
// u1: per-bin decibel output, one float per element.
RWStructuredBuffer<float> g_decibels : register(u1);

// 256 threads per group along X; thread id.x handles element id.x.
[numthreads(256, 1, 1)]
void main(uint3 id : SV_DispatchThreadID)
{
    // Each element is addressed as 8 bytes: real part at +0, imaginary part at +4.
    const uint byteOffset = id.x * 8;
    const float re = asfloat(g_result.Load(byteOffset + 0));
    const float im = asfloat(g_result.Load(byteOffset + 4));

    // Squared magnitude, then 10 * log10(1 + power).
    const float power = re * re + im * im;
    const float decibels = 10.0f * log10(1.0f + power);

    g_decibels[id.x] = decibels;
}
不过,我将分贝缓冲区的描述更改为结构化缓冲区的描述,只是为了让我更轻松:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 | buffer_desc.BindFlags = D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE; buffer_desc.ByteWidth = sizeof(float) * NumBins(); buffer_desc.CPUAccessFlags = 0; buffer_desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED; buffer_desc.StructureByteStride = sizeof(float); buffer_desc.Usage = D3D11_USAGE_DEFAULT; hr = m_device->CreateBuffer ( &buffer_desc, nullptr, &m_result_buffer1 ); HR_THROW(); view_desc.Buffer.FirstElement = 0; view_desc.Buffer.Flags = 0; view_desc.Buffer.NumElements = NumBins(); view_desc.Format = DXGI_FORMAT_UNKNOWN; view_desc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER; hr = m_device->CreateUnorderedAccessView ( m_result_buffer1, &view_desc, &m_result_view ); HR_THROW(); |
这就是
对于我来说仍然未知的是,当仅需要访问时结果缓冲区是否可读写是否重要-如果将