The Compute pipeline in DirectX 12 is arguably much easier than the graphics pipeline. There's much less complexity to configuring it, and the separation of resources and computations is more explicit, making it easier to wrap your head around. Let's review how the compute pipeline works, how to write compute shaders, and how to execute computations with DirectX 12.
Resources
Like the Raster or Ray Tracing pipelines, computations will require resources, be it Structured Buffers of triangle data, render target Unordered Access Views you're reading/writing to, or Constant Buffer Views of common data.
Root Signatures
Like other pipelines, you can define what resources will be accessible via a Root Signature.
// Create the root signature.
D3D12_FEATURE_DATA_ROOT_SIGNATURE featureData = {};
// This is the highest version the sample supports. If
// CheckFeatureSupport succeeds, the HighestVersion returned will not be
// greater than this.
featureData.HighestVersion = D3D_ROOT_SIGNATURE_VERSION_1_1;
if (FAILED(device->CheckFeatureSupport(D3D12_FEATURE_ROOT_SIGNATURE,
&featureData,
sizeof(featureData))))
{
featureData.HighestVersion = D3D_ROOT_SIGNATURE_VERSION_1_0;
}
D3D12_DESCRIPTOR_RANGE1 ranges[1];
ranges[0].BaseShaderRegister = 0;
ranges[0].RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_UAV;
ranges[0].NumDescriptors = 1;
ranges[0].RegisterSpace = 0;
ranges[0].OffsetInDescriptorsFromTableStart = 0;
ranges[0].Flags = D3D12_DESCRIPTOR_RANGE_FLAG_DATA_VOLATILE;
D3D12_ROOT_PARAMETER1 rootParameters[1];
rootParameters[0].ParameterType =
D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE;
rootParameters[0].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL;
rootParameters[0].DescriptorTable.NumDescriptorRanges = 1;
rootParameters[0].DescriptorTable.pDescriptorRanges = ranges;
D3D12_VERSIONED_ROOT_SIGNATURE_DESC rootSignatureDesc;
rootSignatureDesc.Version = D3D_ROOT_SIGNATURE_VERSION_1_1;
rootSignatureDesc.Desc_1_1.Flags = D3D12_ROOT_SIGNATURE_FLAG_NONE;
rootSignatureDesc.Desc_1_1.NumParameters = 1;
rootSignatureDesc.Desc_1_1.pParameters = rootParameters;
rootSignatureDesc.Desc_1_1.NumStaticSamplers = 0;
rootSignatureDesc.Desc_1_1.pStaticSamplers = nullptr;
ID3DBlob* signatureBlob;
ID3DBlob* error;
try
{
ThrowIfFailed(D3D12SerializeVersionedRootSignature(
&rootSignatureDesc, &signatureBlob, &error));
ThrowIfFailed(mDevice->CreateRootSignature(
0, signature->GetBufferPointer(), signatureBlob->GetBufferSize(),
IID_PPV_ARGS(&rootSignature)));
rootSignature->SetName(L"Hello Compute Root Signature");
}
catch (std::exception e)
{
const char* errStr = (const char*)error->GetBufferPointer();
std::cout << errStr;
error->Release();
error = nullptr;
}
if (signatureBlob)
{
signatureBlob->Release();
signatureBlob = nullptr;
}Compute Shaders
Compute shader compilation is much easier than in raster or ray tracing, as there's only one programmable stage.
// Output texture, bound as an unordered access view at register u0.
RWTexture2D<float4> tOutput : register(u0);

// Each thread group is 16x16x1 threads; every thread writes one texel.
[numthreads(16, 16, 1)]
void main(uint3 groupThreadID : SV_GroupThreadID,        // This thread's offset inside its group, per axis in [0, numthreads)
          uint3 groupID : SV_GroupID,                    // Index of this thread's group inside the grid launched by `Dispatch(x,y,z)`
          uint groupIndex : SV_GroupIndex,               // groupThreadID flattened into a single linear index within the group
          uint3 dispatchThreadID : SV_DispatchThreadID)  // Global thread index across the whole dispatch (here, the pixel)
{
    // Red/green show the thread's position inside its 16x16 group;
    // blue ramps with the global x coordinate over a 1280-wide range.
    const float red = float(groupThreadID.x) / 16.0;
    const float green = float(groupThreadID.y) / 16.0;
    const float blue = dispatchThreadID.x / 1280.0;

    tOutput[dispatchThreadID.xy] = float4(red, green, blue, 1.0);
}
Pipeline State
The compute pipeline only expects a shader and root signature. That's it!
// Describe the compute pipeline: a root signature plus the compiled compute
// shader bytecode.
D3D12_COMPUTE_PIPELINE_STATE_DESC psoDesc = {};
psoDesc.pRootSignature = rootSignature;
psoDesc.CS.pShaderBytecode = compShader->GetBufferPointer();
psoDesc.CS.BytecodeLength = compShader->GetBufferSize();

try
{
    ThrowIfFailed(mDevice->CreateComputePipelineState(
        &psoDesc, IID_PPV_ARGS(&pipelineState)));
}
catch (const std::exception&)
{
    std::cout << "Failed to create Compute Pipeline!";
}

// The shader blob is released once pipeline creation has been attempted.
if (compShader)
{
    compShader->Release();
    compShader = nullptr;
}
Unordered Access View
Let's create an unordered access view of a render target we'll be writing data to:
// Create a shader-visible descriptor heap with a single slot for the UAV.
D3D12_DESCRIPTOR_HEAP_DESC heapDesc = {};
heapDesc.NumDescriptors = 1;
heapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
heapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
ThrowIfFailed(
    mDevice->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&mUavHeap)));

// Describe the texture the compute shader writes to: a single-mip 2D
// RGBA8 texture of mWidth x mHeight that allows unordered access.
D3D12_RESOURCE_DESC texResourceDesc = {};
texResourceDesc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D;
texResourceDesc.Alignment = 0;
texResourceDesc.Width = mWidth;
texResourceDesc.Height = mHeight;
texResourceDesc.DepthOrArraySize = 1;
texResourceDesc.MipLevels = 1;
texResourceDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM;
texResourceDesc.SampleDesc.Count = 1;
texResourceDesc.SampleDesc.Quality = 0;
texResourceDesc.Layout = D3D12_TEXTURE_LAYOUT_UNKNOWN;
texResourceDesc.Flags = D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS;

D3D12_HEAP_PROPERTIES heapProps = {};
heapProps.Type = D3D12_HEAP_TYPE_DEFAULT;
heapProps.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN;
heapProps.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN;
heapProps.CreationNodeMask = 1;
heapProps.VisibleNodeMask = 1;

// No optimized clear value is passed: the texture is neither a render target
// nor a depth-stencil, so the parameter has to be null.
ThrowIfFailed(mDevice->CreateCommittedResource(
    &heapProps, D3D12_HEAP_FLAG_NONE, &texResourceDesc,
    D3D12_RESOURCE_STATE_UNORDERED_ACCESS, nullptr,
    IID_PPV_ARGS(&mTexResource)));
mTexResource->SetName(L"Compute Target");

mUAVDescriptorSize = mDevice->GetDescriptorHandleIncrementSize(
    D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);

// Writes the CPU handle of a heap slot into *cpuDescriptor and returns the
// slot index used. If the requested index is outside the heap, the next
// free slot (mDescriptorsAllocated) is taken instead.
auto AllocateDescriptor =
    [&](D3D12_CPU_DESCRIPTOR_HANDLE* cpuDescriptor,
        UINT descriptorIndexToUse)
{
    auto descriptorHeapCpuBase =
        mUavHeap->GetCPUDescriptorHandleForHeapStart();
    if (descriptorIndexToUse >= mUavHeap->GetDesc().NumDescriptors)
    {
        descriptorIndexToUse = mDescriptorsAllocated++;
    }
    *cpuDescriptor = D3D12_CPU_DESCRIPTOR_HANDLE{
        descriptorHeapCpuBase.ptr +
        SIZE_T(descriptorIndexToUse) * SIZE_T(mUAVDescriptorSize)};
    return descriptorIndexToUse;
};
heapIndex = AllocateDescriptor(&uavCPUHandle, heapIndex);

// The GPU handle must point at the same slot that was just allocated.
uavGPUHandle = D3D12_GPU_DESCRIPTOR_HANDLE{
    mUavHeap->GetGPUDescriptorHandleForHeapStart().ptr +
    UINT64(heapIndex) * UINT64(mUAVDescriptorSize)};

// Create the unordered access view of the texture in that slot.
D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {};
uavDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM;
uavDesc.ViewDimension = D3D12_UAV_DIMENSION_TEXTURE2D;
mDevice->CreateUnorderedAccessView(mTexResource, nullptr, &uavDesc,
                                   uavCPUHandle);
Compute Calls
The core of executing compute calls is calling Dispatch with the number of thread groups to launch along each axis as the arguments. We'll be executing a screen space compute shader, so there will be a group for every 16x16 block of pixels on the screen.
void setupCommands()
{
ThrowIfFailed(commandAllocator->Reset());
ThrowIfFailed(commandList->Reset(commandAllocator, mPipelineState));
// Set necessary state.
commandList->SetComputeRootSignature(rootSignature);
ID3D12DescriptorHeap* pDescriptorHeaps[] = {mUavHeap};
commandList->SetDescriptorHeaps(_countof(pDescriptorHeaps),
pDescriptorHeaps);
commandList->SetComputeRootDescriptorTable(0, uavGPUHandle);
auto divCiel = [](unsigned val, unsigned x) -> unsigned
{ return val / x + ((val % x) > 0 ? 1 : 0); };
commandList->Dispatch(divCiel(width, 16), divCiel(height, 16), 1);
D3D12_RESOURCE_BARRIER preCopyBarriers[2];
preCopyBarriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
mRenderTargets[frameIndex], D3D12_RESOURCE_STATE_PRESENT,
D3D12_RESOURCE_STATE_COPY_DEST);
preCopyBarriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
mTexResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
D3D12_RESOURCE_STATE_COPY_SOURCE);
commandList->ResourceBarrier(ARRAYSIZE(preCopyBarriers), preCopyBarriers);
commandList->CopyResource(renderTargets[frameIndex], texResource);
D3D12_RESOURCE_BARRIER postCopyBarriers[2];
postCopyBarriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
mRenderTargets[fameIndex], D3D12_RESOURCE_STATE_COPY_DEST,
D3D12_RESOURCE_STATE_PRESENT);
postCopyBarriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
mTexResource, D3D12_RESOURCE_STATE_COPY_SOURCE,
D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
CD3DX12_RESOURCE_BARRIER result = {};
D3D12_RESOURCE_BARRIER& barrier = result;
result.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
result.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
barrier.Transition.pResource = mTexResource;
barrier.Transition.StateBefore = D3D12_RESOURCE_STATE_COPY_SOURCE;
barrier.Transition.StateAfter = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
commandList->ResourceBarrier(ARRAYSIZE(postCopyBarriers),
postCopyBarriers);
ThrowIfFailed(commandList->Close());
}
Alain Galvan