-
Notifications
You must be signed in to change notification settings - Fork 48
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Global scan #665
base: master
Are you sure you want to change the base?
Global scan #665
Conversation
…but not working yet
…but not working yet
[CI]: Can one of the admins verify this patch? |
#ifndef NBL_BUILTIN_MAX_SCAN_LEVELS | ||
#define NBL_BUILTIN_MAX_SCAN_LEVELS 7 | ||
#endif |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
do as NBL_CONSTEXPR
instead of define
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
also this better be formulated slightly differently, in terms of reduction passes, not reduction + downsweep
uint32_t topLevel; | ||
uint32_t temporaryStorageOffset[NBL_BUILTIN_MAX_SCAN_LEVELS/2]; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
use uint16_t for this
uint32_t topLevel; | ||
uint32_t temporaryStorageOffset[NBL_BUILTIN_MAX_SCAN_LEVELS/2]; | ||
}; | ||
|
||
Parameters_t getParameters(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
no need for such a forward decl anymore
struct DefaultSchedulerParameters_t | ||
{ | ||
uint32_t finishedFlagOffset[NBL_BUILTIN_MAX_SCAN_LEVELS-1]; | ||
uint32_t cumulativeWorkgroupCount[NBL_BUILTIN_MAX_SCAN_LEVELS]; | ||
|
||
}; | ||
|
||
DefaultSchedulerParameters_t getSchedulerParameters(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
split out schedulers into separate headers
template<typename Storage_t> | ||
void getData( | ||
inout Storage_t data, | ||
in uint levelInvocationIndex, | ||
in uint localWorkgroupIndex, | ||
in uint treeLevel, | ||
in uint pseudoLevel | ||
NBL_REF_ARG(Storage_t) data, | ||
NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, | ||
NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, | ||
NBL_CONST_REF_ARG(uint32_t) treeLevel, | ||
NBL_CONST_REF_ARG(uint32_t) pseudoLevel | ||
); | ||
} | ||
} | ||
} | ||
#define _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_ | ||
#endif | ||
|
||
#ifndef _NBL_HLSL_SCAN_SET_DATA_DECLARED_ | ||
namespace nbl | ||
{ | ||
namespace hlsl | ||
{ | ||
namespace scan | ||
{ | ||
template<typename Storage_t> | ||
void setData( | ||
in Storage_t data, | ||
in uint levelInvocationIndex, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
no more forward declarations, just let it be an accessor.
Also the level
const-ref-args can be rolled up into a single struct SDataIndex
// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. | ||
// This file is part of the "Nabla Engine". | ||
// For conditions of distribution and use, see copyright notice in nabla.h | ||
|
||
#ifndef _NBL_HLSL_SCAN_DESCRIPTORS_INCLUDED_ | ||
#define _NBL_HLSL_SCAN_DESCRIPTORS_INCLUDED_ | ||
|
||
// choerent -> globallycoherent No newline at end of file | ||
#include "nbl/builtin/hlsl/scan/declarations.hlsl" | ||
#include "nbl/builtin/hlsl/workgroup/basic.hlsl" | ||
|
||
// coherent -> globallycoherent | ||
|
||
namespace nbl | ||
{ | ||
namespace hlsl | ||
{ | ||
namespace scan | ||
{ | ||
|
||
template<uint32_t scratchElementCount=scratchSz> // (REVIEW): This should be externally defined. Maybe change the scratch buffer to RWByteAddressBuffer? Annoying to manage though... | ||
struct Scratch | ||
{ | ||
uint32_t workgroupsStarted; | ||
uint32_t data[scratchElementCount]; | ||
}; | ||
|
||
[[vk::binding(0 ,0)]] RWStructuredBuffer<uint32_t /*Storage_t*/> scanBuffer; // (REVIEW): Make the type externalizable. Decide how (#define?) | ||
[[vk::binding(1 ,0)]] RWStructuredBuffer<Scratch> globallycoherent scanScratchBuf; // (REVIEW): Check if globallycoherent can be used with Vulkan Mem Model | ||
|
||
template<typename Storage_t, bool isExclusive=false> | ||
void getData( | ||
NBL_REF_ARG(Storage_t) data, | ||
NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, | ||
NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, | ||
NBL_CONST_REF_ARG(uint32_t) treeLevel, | ||
NBL_CONST_REF_ARG(uint32_t) pseudoLevel | ||
) | ||
{ | ||
const Parameters_t params = getParameters(); // defined differently for direct and indirect shaders | ||
|
||
uint32_t offset = levelInvocationIndex; | ||
const bool notFirstOrLastLevel = bool(pseudoLevel); | ||
if (notFirstOrLastLevel) | ||
offset += params.temporaryStorageOffset[pseudoLevel-1u]; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
remove this whole file, it should be userspace
// TODO: Can we make it a static variable? | ||
groupshared uint32_t wgScratch[SharedScratchSz]; | ||
|
||
#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" | ||
|
||
template<uint16_t offset> | ||
struct WGScratchProxy | ||
{ | ||
uint32_t get(const uint32_t ix) | ||
{ | ||
return wgScratch[ix+offset]; | ||
} | ||
void set(const uint32_t ix, const uint32_t value) | ||
{ | ||
wgScratch[ix+offset] = value; | ||
} | ||
|
||
uint32_t atomicAdd(uint32_t ix, uint32_t val) | ||
{ | ||
return glsl::atomicAdd(wgScratch[ix + offset], val); | ||
} | ||
|
||
void workgroupExecutionAndMemoryBarrier() | ||
{ | ||
nbl::hlsl::glsl::barrier(); | ||
//nbl::hlsl::glsl::memoryBarrierShared(); implied by the above | ||
} | ||
}; | ||
static WGScratchProxy<0> accessor; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
scratches are userspace
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
use accessors
/** | ||
* Required since we rely on SubgroupContiguousIndex instead of | ||
* gl_LocalInvocationIndex which means to match the global index | ||
* we can't use the gl_GlobalInvocationID but an index based on | ||
* SubgroupContiguousIndex. | ||
*/ | ||
uint32_t globalIndex() | ||
{ | ||
return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we have this in a header already
struct ScanPushConstants | ||
{ | ||
nbl::hlsl::scan::Parameters_t scanParams; | ||
nbl::hlsl::scan::DefaultSchedulerParameters_t schedulerParams; | ||
}; | ||
|
||
[[vk::push_constant]] | ||
ScanPushConstants spc; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
everything affecting the pipeline layout should be userspace
#ifndef _NBL_HLSL_MAIN_DEFINED_ | ||
[numthreads(_NBL_HLSL_WORKGROUP_SIZE_, 1, 1)] | ||
void CSMain() | ||
[numthreads(WORKGROUP_SIZE,1,1)] | ||
void main() | ||
{ | ||
if (bool(nbl::hlsl::scan::getIndirectElementCount())) | ||
nbl::hlsl::scan::main(); | ||
if(bool(nbl::hlsl::scan::getIndirectElementCount())) { | ||
// TODO call main from virtual_workgroup.hlsl | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
all this should be userspace, also there will be very little difference between direct and indirect
namespace nbl | ||
{ | ||
namespace hlsl | ||
{ | ||
namespace scan | ||
{ | ||
template<class Binop, class Storage_t> | ||
void virtualWorkgroup(in uint treeLevel, in uint localWorkgroupIndex) | ||
template<class Binop, typename Storage_t, bool isExclusive, uint16_t ItemCount, class Accessor, class device_capabilities=void> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why not template on the workgroup scan instead?
Within you can alias and extract:
- binop
- storage_t
- exclusive or not
- item count
- smem accessor
- device traits necessary
const uint32_t levelInvocationIndex = localWorkgroupIndex * glsl::gl_WorkGroupSize().x + SubgroupContiguousIndex(); | ||
const bool lastInvocationInGroup = SubgroupContiguousIndex() == (gl_WorkGroupSize().x - 1); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
shouldn't ItemCount
be used instead of glsl::gl_WorkGroupSize().x
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
most definitely
// push constants of the default direct scan pipeline provide both aux memory offset params and scheduling params | ||
struct DefaultPushConstants | ||
{ | ||
Parameters scanParams; | ||
SchedulerParameters schedulerParams; | ||
}; | ||
|
||
struct DispatchInfo | ||
{ | ||
DispatchInfo() : wg_count(0u) | ||
{ | ||
} | ||
|
||
// in case we scan very few elements, you don't want to launch workgroups that wont do anything | ||
DispatchInfo(const IPhysicalDevice::SLimits& limits, const uint32_t elementCount, const uint32_t workgroupSize) | ||
{ | ||
constexpr auto workgroupSpinningProtection = 4u; // to prevent first workgroup starving/idling on level 1 after finishing level 0 early | ||
wg_count = limits.computeOptimalPersistentWorkgroupDispatchSize(elementCount,workgroupSize,workgroupSpinningProtection); | ||
} | ||
|
||
uint32_t wg_count; | ||
}; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
move everything to HLSL, you can also have a look at PR #643 where the computeOptimalPersistentWorkgroupDispatchSize
may move to device_capability_traits
protected: | ||
// REVIEW: Does it need an empty destructor? |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
no, a default one will be made
// REVIEW: Does it need an empty destructor? | ||
|
||
core::smart_refctd_ptr<ILogicalDevice> m_device; | ||
core::smart_refctd_ptr<IGPUDescriptorSetLayout> m_ds_layout; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you wont be using any descriptors, so no need for a layout
static inline void updateDescriptorSet(ILogicalDevice* device, IGPUDescriptorSet* set, const asset::SBufferRange<IGPUBuffer>& input_range, const asset::SBufferRange<IGPUBuffer>& scratch_range) | ||
{ | ||
IGPUDescriptorSet::SDescriptorInfo infos[2]; | ||
infos[0].desc = input_range.buffer; | ||
infos[0].info.buffer = {input_range.offset,input_range.size}; | ||
infos[1].desc = scratch_range.buffer; | ||
infos[1].info.buffer = {scratch_range.offset,scratch_range.size}; | ||
|
||
video::IGPUDescriptorSet::SWriteDescriptorSet writes[2]; | ||
for (auto i=0u; i<2u; i++) | ||
{ | ||
writes[i] = { .dstSet = set,.binding = i,.arrayElement = 0u,.count = 1u,.info = infos+i }; | ||
} | ||
|
||
device->updateDescriptorSets(2, writes, 0u, nullptr); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
no more descriptor sets
inline void buildParameters(const uint32_t elementCount, DefaultPushConstants& pushConstants, DispatchInfo& dispatchInfo) | ||
{ | ||
pushConstants.schedulerParams = SchedulerParameters(pushConstants.scanParams,elementCount,m_workgroupSize); | ||
dispatchInfo = DispatchInfo(m_device->getPhysicalDevice()->getLimits(),elementCount,m_workgroupSize); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
move to HLSL
const IGPUDescriptorSetLayout::SBinding bindings[2] = { | ||
{ 0u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, video::IGPUShader::ESS_COMPUTE, 1u, nullptr }, // main buffer | ||
{ 1u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, video::IGPUShader::ESS_COMPUTE, 1u, nullptr } // scratch | ||
}; | ||
|
||
m_ds_layout = m_device->createDescriptorSetLayout(bindings); | ||
assert(m_ds_layout && "CArithmeticOps Descriptor Set Layout was not created successfully"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
no bindings
inline core::smart_refctd_ptr<asset::ICPUShader> createBaseShader(const char* shaderFile, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const | ||
{ | ||
auto system = m_device->getPhysicalDevice()->getSystem(); | ||
core::smart_refctd_ptr<const system::IFile> hlsl; | ||
{ | ||
auto loadBuiltinData = [&](const std::string _path) -> core::smart_refctd_ptr<const nbl::system::IFile> | ||
{ | ||
nbl::system::ISystem::future_t<core::smart_refctd_ptr<nbl::system::IFile>> future; | ||
system->createFile(future, system::path(_path), core::bitflag(nbl::system::IFileBase::ECF_READ) | nbl::system::IFileBase::ECF_MAPPABLE); | ||
if (future.wait()) | ||
return future.copy(); | ||
return nullptr; | ||
}; | ||
|
||
// hlsl = loadBuiltinData("nbl/builtin/hlsl/scan/indirect_reduce.hlsl"); ?? | ||
hlsl = loadBuiltinData(shaderFile); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is super convoluted, why is the user supposed to know the path to the file you'll load?
break; | ||
} | ||
|
||
return asset::CHLSLCompiler::createOverridenCopy(cpushader.get(), "#define WORKGROUP_SIZE %d\ntypedef %s Storage_t;\n#define BINOP nbl::hlsl::%s\n#define SCRATCH_SZ %d\n", m_workgroupSize, storageType, binop, scratchSz); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it would be better to build the shader source instead of overriding some random shader
CArithmeticOps(core::smart_refctd_ptr<ILogicalDevice>&& device) : CArithmeticOps(std::move(device),core::roundDownToPoT(device->getPhysicalDevice()->getLimits().maxOptimallyResidentWorkgroupInvocations)) {} | ||
|
||
CArithmeticOps(core::smart_refctd_ptr<ILogicalDevice>&& device, const uint32_t workgroupSize) : m_device(std::move(device)), m_workgroupSize(workgroupSize) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if you don't plan to make CArithmeticOps
be usable on its own, make the constructors protected
{ | ||
|
||
public: | ||
CReduce(core::smart_refctd_ptr<ILogicalDevice>&& device) : CReduce(std::move(device), core::roundDownToPoT(device->getPhysicalDevice()->getLimits().maxOptimallyResidentWorkgroupInvocations)) {} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
don't repeat the roundDownToPoT
everywhere, you should rely on CArithmeticOps
to set that up for you
|
||
#include "nbl/builtin/hlsl/scan/declarations.hlsl" | ||
static_assert(NBL_BUILTIN_MAX_LEVELS & 0x1, "NBL_BUILTIN_MAX_LEVELS must be odd!"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if we don't have our own static_assert
in HLSL, then stick this inside an ifndef __HLSL_VERSION
inside the HLSL header
@@ -143,270 +142,43 @@ prefix sum with subgroup sized workgroups at peak Bandwidth efficiency in about | |||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
update the comments about the algo between lines 112-141 because we can't spinwait, we need to outline the new algo properly
EST_INCLUSIVE = _NBL_GLSL_SCAN_TYPE_INCLUSIVE_, | ||
EST_INCLUSIVE = 0u, | ||
// computes output[n] = Sum_{i<n}(input[i]), meaning first element is identity | ||
EST_EXCLUSIVE = _NBL_GLSL_SCAN_TYPE_EXCLUSIVE_, | ||
EST_EXCLUSIVE, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd template on a bool instead
core::smart_refctd_ptr<asset::ICPUShader> CReduce::createShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz) const | ||
{ | ||
core::smart_refctd_ptr<asset::ICPUShader> base = createBaseShader("nbl/builtin/hlsl/scan/direct.hlsl", dataType, op, scratchSz); | ||
return asset::CHLSLCompiler::createOverridenCopy(base.get(), "#define IS_EXCLUSIVE %s\n#define IS_SCAN %s\n", "false", "false"); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
- the shader would need to have a name
default_direct
instead of direct
if it requires you define something for it to work - if possible its better to build a source with
ostringstream
without overriden copies, so you have this
#include "nbl/builtin/hlsl/scan/default_direct.hlsl"
void main()
{
nbl::hlsl::scan::default_direct<$Type,$Binop,$dataType,$scratchSize>::__call();
}
where Type is some template param that helps you tell apart, reduction, inclusive or exclusive scan
{ | ||
core::smart_refctd_ptr<asset::ICPUShader> base = createBaseShader("nbl/builtin/hlsl/scan/direct.hlsl", dataType, op, scratchSz); | ||
return asset::CHLSLCompiler::createOverridenCopy(base.get(), "#define IS_EXCLUSIVE %s\n#define IS_SCAN %s\n", scanType == EST_EXCLUSIVE ? "true" : "false", "true"); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
same comment as for CReduce
asset::ICPUShader* CScanner::getDefaultShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) | ||
{ | ||
if (!m_shaders[scanType][dataType][op]) | ||
m_shaders[scanType][dataType][op] = createShader(false,scanType,dataType,op,scratchSz); | ||
return m_shaders[scanType][dataType][op].get(); | ||
} | ||
|
||
#if 0 // TODO: port | ||
core::smart_refctd_ptr<asset::ICPUShader> CScanner::createShader(const bool indirect, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) const | ||
IGPUShader* CScanner::getDefaultSpecializedShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) | ||
{ | ||
auto system = m_device->getPhysicalDevice()->getSystem(); | ||
core::smart_refctd_ptr<const system::IFile> glsl; | ||
{ | ||
auto loadBuiltinData = [&](const std::string _path) -> core::smart_refctd_ptr<const nbl::system::IFile> | ||
{ | ||
nbl::system::ISystem::future_t<core::smart_refctd_ptr<nbl::system::IFile>> future; | ||
system->createFile(future, system::path(_path), core::bitflag(nbl::system::IFileBase::ECF_READ) | nbl::system::IFileBase::ECF_MAPPABLE); | ||
if (future.wait()) | ||
return future.copy(); | ||
return nullptr; | ||
}; | ||
if (!m_specialized_shaders[scanType][dataType][op]) | ||
{ | ||
auto cpuShader = core::smart_refctd_ptr<asset::ICPUShader>(getDefaultShader(scanType,dataType,op,scratchSz)); | ||
cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/direct.hlsl"); | ||
cpuShader->setShaderStage(asset::IShader::ESS_COMPUTE); | ||
|
||
auto gpushader = m_device->createShader(cpuShader.get()); | ||
|
||
m_specialized_shaders[scanType][dataType][op] = gpushader; | ||
} | ||
return m_specialized_shaders[scanType][dataType][op].get(); | ||
} | ||
|
||
if(indirect) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
these 3 utilities are practically identical for CScan and CReduce, why not hoist them to CArithmeticOps?
void computeParameters(NBL_CONST_REF_ARG(uint32_t) elementCount, out Parameters_t _scanParams, out DefaultSchedulerParameters_t _schedulerParams) | ||
{ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The parameters are "married" to the scheduling logic.
So make these methods of DefaultSchedulerParameters_t
, this one is really just a constructor so turn it into a static DefaultSchedulerParameters_t create(...)
method
Hint: whenever a function takes inout
or out
struct T
as first parameter, it likely means that I wrote it in GLSL to be a pseudo-method (would have a C fn ptr signature matching a method).
Bonus Hint: Whenever the first param is a in T
it would have been a const method in C++.
* The CScanner.h parameter computation calculates the number of virtual workgroups that will have to be launched for the Scan operation | ||
* (always based on the elementCount) as well as different offsets for the results of each step of the Scan operation, flag positions | ||
* that are used for synchronization etc. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The logic needs to change to "last one out closes the door"
The comments too.
we will now only launch RoundUp(N/ItemsPerWorkgroup)
virtual workgroups, not the geometric series
* Keep in mind that each virtual workgroup executes a single step of the whole algorithm, which is why we have the cumulativeWorkgroupCount. | ||
* The first virtual workgroups will work on the upsweep phase, the next on the downsweep phase. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since we do "last workgroup takes over" we will now only launch RoundUp(N/ItemsPerWorkgroup)
virtual workgroups, not the geometric series
So no need for the cumulativeWorkgroupCount
.
* |> finishedFlagOffset - an index in the scratch buffer where each virtual workgroup indicates that ALL its invocations have finished their work. This helps | ||
* synchronizing between workgroups with while-loop spinning. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
amend comments as needed
uint32_t cumulativeWorkgroupCount[NBL_BUILTIN_MAX_LEVELS]; | ||
uint32_t workgroupFinishFlagsOffset[NBL_BUILTIN_MAX_LEVELS]; | ||
uint32_t lastWorkgroupSetCountForLevel[NBL_BUILTIN_MAX_LEVELS]; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
not sure you need either cumulativeWorkgroupCount
or lastWorkgroupSetCountForLevel
for a "last one out closes the door" upsweep
default: | ||
break; | ||
#if NBL_BUILTIN_MAX_SCAN_LEVELS>7 | ||
const uint32_t workgroupSizeLog2 = firstbithigh(glsl::gl_WorkGroupSize().x); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
modify it to not use Vulkan/SPIR-V environment builtins, that way we can use it from C++ too!
(So take workgroup size - or more accurately item per workgroup count - from the outside)
#endif | ||
// REVIEW: Putting topLevel second allows better alignment for packing of constant variables, assuming lastElement has length 4. (https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-packing-rules) | ||
struct Parameters_t { | ||
uint32_t lastElement[NBL_BUILTIN_MAX_LEVELS/2+1]; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
add documentation about what lastElement
is used for
_schedulerParams.finishedFlagOffset[1] = 1u; | ||
|
||
_scanParams.temporaryStorageOffset[0] = 2u; | ||
break; | ||
case 2u: | ||
_schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1); | ||
_schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+1u; | ||
_schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+WorkgroupCount(1); | ||
_schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(0); | ||
// climb up | ||
_schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1); | ||
_schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+1u; | ||
// climb down | ||
_schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[1]+2u; | ||
|
||
_scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[3]+WorkgroupCount(1); | ||
_scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0); | ||
break; | ||
case 3u: | ||
_schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1); | ||
_schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(2); | ||
_schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+1u; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
due to forward progress guarantees you can only do upsweep or downsweep in a single dispatch, so you don't need the "climb down"
you probably don't even need the cumulative workgroup counts, as the way you'd find out if your current workgroup truly is the last one, is by checking the return value of the finished flag atomic counter.
e.g. markFinished
uint32_t howManyDone = flagsAccessor.atomicAdd(finishedOffset[level]+virtualWorkgroupIndexInThisLevel);
//uint32_t currentLastWorkgroup = currentLastElement>>ItemsPerWorkgroupLog2;
uint32_t nextLevelLastWorkgroup = currentLastWorkgroup >>ItemsPerWorkgroupLog2;
uint32_t virtualWorkgroupIndexInNextLevel = virtualWorkgroupIndexInThisLevel>>ItemsPerWorkgroupLog2;
// NOT: either last workgroup, or in the last bundle of workgroups and last workgroup dynamically there
if (howManyDone!=(ItemsPerWorkgroup-1) && (virtualWorkgroupIndexInNextLevel!=nextLevelLastWorkgroup || (virtualWorkgroupIndexInThisLevel&(ItemsPerWorkgroupLog2-1))!=howManyDone) )
return;
// last one takes over
currentLastWorkgroup = nextLevelLastWorkgroup ;
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
btw I'd try to rewrite this as a loop instead of a weird switch that was only written like this to avoid recursion in GLSL
/** | ||
* treeLevel - the current level in the Blelloch Scan | ||
* localWorkgroupIndex - the workgroup index the current invocation is a part of in the specific virtual dispatch. | ||
* For example, if we have dispatched 10 workgroups and the virtual workgroup number is 35, then the localWorkgroupIndex should be 5. | ||
*/ | ||
template<class Accessor> | ||
bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_REF_ARG(uint32_t) treeLevel, NBL_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) sharedScratch) | ||
{ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you can't do any flagging via sharedmemory because you want inter-workgroup communication, must be a BDA address!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
even better a bda::__ptr<uint32_t>
because then you'll have atomicAdd
methods
sharedScratch.workgroupExecutionAndMemoryBarrier(); | ||
const uint32_t globalWorkgroupIndex = sharedScratch.get(0u); | ||
|
||
treeLevel = sharedScratch.get(1u); | ||
if(treeLevel>lastLevel) | ||
return true; | ||
|
||
localWorkgroupIndex = globalWorkgroupIndex; | ||
const bool dependentLevel = treeLevel != 0u; | ||
if(dependentLevel) | ||
{ | ||
const uint32_t prevLevel = treeLevel - 1u; | ||
localWorkgroupIndex -= params.cumulativeWorkgroupCount[prevLevel]; | ||
if(workgroup::SubgroupContiguousIndex() == 0u) | ||
{ | ||
uint32_t dependentsCount = 1u; | ||
if(treeLevel <= topLevel) | ||
{ | ||
dependentsCount = glsl::gl_WorkGroupSize().x; | ||
const bool lastWorkgroup = (globalWorkgroupIndex+1u)==params.cumulativeWorkgroupCount[treeLevel]; | ||
if (lastWorkgroup) | ||
{ | ||
const Parameters_t scanParams = getParameters(); | ||
dependentsCount = scanParams.lastElement[treeLevel]+1u; | ||
if (treeLevel<topLevel) | ||
{ | ||
dependentsCount -= scanParams.lastElement[treeLevel+1u] * glsl::gl_WorkGroupSize().x; | ||
} | ||
} | ||
} | ||
uint32_t dependentsFinishedFlagOffset = localWorkgroupIndex; | ||
if (treeLevel>topLevel) // !(prevLevel<topLevel) TODO: merge with `else` above? | ||
dependentsFinishedFlagOffset /= glsl::gl_WorkGroupSize().x; | ||
dependentsFinishedFlagOffset += params.finishedFlagOffset[prevLevel]; | ||
while (scanScratchBuf[0].data[dependentsFinishedFlagOffset]!=dependentsCount) | ||
glsl::memoryBarrierBuffer(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I expect a rewrite of this into "last one out closes the door"
if (treeLevel<topLevel) | ||
{ | ||
finishedFlagOffset += localWorkgroupIndex/glsl::gl_WorkGroupSize().x; | ||
glsl::atomicAdd(scanScratchBuf[0].data[finishedFlagOffset], 1u); | ||
} | ||
else if (treeLevel!=(topLevel<<1u)) | ||
{ | ||
finishedFlagOffset += localWorkgroupIndex; | ||
scanScratchBuf[0].data[finishedFlagOffset] = 1u; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
comments about what and why is happening in the if statements
template<class Accessor> | ||
bool getWork(NBL_REF_ARG(Accessor) accessor) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
what's the accessor for? needs a better name
Also why don't you store the accessor past the create
as a member?
It should now work with all sizes of inputs.
Description
Initial implementation of the Global Scan in HLSL
Testing
Not yet finished
TODO list:
Still need to test the HLSL code as well as the porting of the example code to the new API of vulkan_1_3 branch