
Global scan #665

Open
wants to merge 34 commits into master

Conversation

kpentaris
Contributor

Description

Initial implementation of the Global Scan in HLSL

Testing

Not yet finished

TODO list:

Still need to test the HLSL code, as well as the port of the example code to the new API of the vulkan_1_3 branch

@devshgraphicsprogrammingjenkins
Contributor

[CI]: Can one of the admins verify this patch?

Comment on lines 12 to 14
#ifndef NBL_BUILTIN_MAX_SCAN_LEVELS
#define NBL_BUILTIN_MAX_SCAN_LEVELS 7
#endif


do this as an NBL_CONSTEXPR instead of a #define
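A minimal sketch of that change, assuming the NBL_CONSTEXPR macro from Nabla's cpp_compat headers (the constant name MaxScanLevels is hypothetical):

// sketch: compile-time constant instead of a macro
NBL_CONSTEXPR uint32_t MaxScanLevels = 7;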


also this better be formulated slightly differently, in terms of reduction passes, not reduction + downsweep

Comment on lines 25 to 26
uint32_t topLevel;
uint32_t temporaryStorageOffset[NBL_BUILTIN_MAX_SCAN_LEVELS/2];


use uint16_t for this
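i.e. a sketch of the suggested change (shown applied to both quoted fields; how far to shrink them is the reviewer's call):

uint16_t topLevel;
uint16_t temporaryStorageOffset[NBL_BUILTIN_MAX_SCAN_LEVELS/2];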

uint32_t topLevel;
uint32_t temporaryStorageOffset[NBL_BUILTIN_MAX_SCAN_LEVELS/2];
};

Parameters_t getParameters();


no need for such a forward decl anymore

Comment on lines 31 to 38
struct DefaultSchedulerParameters_t
{
uint32_t finishedFlagOffset[NBL_BUILTIN_MAX_SCAN_LEVELS-1];
uint32_t cumulativeWorkgroupCount[NBL_BUILTIN_MAX_SCAN_LEVELS];

};

DefaultSchedulerParameters_t getSchedulerParameters();


split out schedulers into separate headers

Comment on lines 40 to 54
template<typename Storage_t>
void getData(
-inout Storage_t data,
-in uint levelInvocationIndex,
-in uint localWorkgroupIndex,
-in uint treeLevel,
-in uint pseudoLevel
+NBL_REF_ARG(Storage_t) data,
+NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex,
+NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex,
+NBL_CONST_REF_ARG(uint32_t) treeLevel,
+NBL_CONST_REF_ARG(uint32_t) pseudoLevel
);
}
}
}
#define _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_
#endif

#ifndef _NBL_HLSL_SCAN_SET_DATA_DECLARED_
namespace nbl
{
namespace hlsl
{
namespace scan
{
template<typename Storage_t>
void setData(
in Storage_t data,
in uint levelInvocationIndex,


no more forward declarations, just let it be an accessor.

Also the level const-ref-args can be rolled up into a single struct SDataIndex
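A hypothetical sketch of that roll-up (field names assumed from the current parameter list):

struct SDataIndex
{
    uint32_t levelInvocationIndex;
    uint32_t localWorkgroupIndex;
    uint32_t treeLevel;
    uint32_t pseudoLevel;
};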

Comment on lines 0 to 45
// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O.
// This file is part of the "Nabla Engine".
// For conditions of distribution and use, see copyright notice in nabla.h

#ifndef _NBL_HLSL_SCAN_DESCRIPTORS_INCLUDED_
#define _NBL_HLSL_SCAN_DESCRIPTORS_INCLUDED_

// choerent -> globallycoherent
#include "nbl/builtin/hlsl/scan/declarations.hlsl"
#include "nbl/builtin/hlsl/workgroup/basic.hlsl"

// coherent -> globallycoherent

namespace nbl
{
namespace hlsl
{
namespace scan
{

template<uint32_t scratchElementCount=scratchSz> // (REVIEW): This should be externally defined. Maybe change the scratch buffer to RWByteAddressBuffer? Annoying to manage though...
struct Scratch
{
uint32_t workgroupsStarted;
uint32_t data[scratchElementCount];
};

[[vk::binding(0 ,0)]] RWStructuredBuffer<uint32_t /*Storage_t*/> scanBuffer; // (REVIEW): Make the type externalizable. Decide how (#define?)
[[vk::binding(1 ,0)]] RWStructuredBuffer<Scratch> globallycoherent scanScratchBuf; // (REVIEW): Check if globallycoherent can be used with Vulkan Mem Model

template<typename Storage_t, bool isExclusive=false>
void getData(
NBL_REF_ARG(Storage_t) data,
NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex,
NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex,
NBL_CONST_REF_ARG(uint32_t) treeLevel,
NBL_CONST_REF_ARG(uint32_t) pseudoLevel
)
{
const Parameters_t params = getParameters(); // defined differently for direct and indirect shaders

uint32_t offset = levelInvocationIndex;
const bool notFirstOrLastLevel = bool(pseudoLevel);
if (notFirstOrLastLevel)
offset += params.temporaryStorageOffset[pseudoLevel-1u];

Member

remove this whole file, it should be userspace

Comment on lines 13 to 41
// TODO: Can we make it a static variable?
groupshared uint32_t wgScratch[SharedScratchSz];

#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"

template<uint16_t offset>
struct WGScratchProxy
{
uint32_t get(const uint32_t ix)
{
return wgScratch[ix+offset];
}
void set(const uint32_t ix, const uint32_t value)
{
wgScratch[ix+offset] = value;
}

uint32_t atomicAdd(uint32_t ix, uint32_t val)
{
return glsl::atomicAdd(wgScratch[ix + offset], val);
}

void workgroupExecutionAndMemoryBarrier()
{
nbl::hlsl::glsl::barrier();
//nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
}
};
static WGScratchProxy<0> accessor;


scratches are userspace


use accessors

Comment on lines 55 to 64
/**
* Required since we rely on SubgroupContiguousIndex instead of
* gl_LocalInvocationIndex which means to match the global index
* we can't use the gl_GlobalInvocationID but an index based on
* SubgroupContiguousIndex.
*/
uint32_t globalIndex()
{
return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex();
}


we have this in a header already

Comment on lines +46 to +53
struct ScanPushConstants
{
nbl::hlsl::scan::Parameters_t scanParams;
nbl::hlsl::scan::DefaultSchedulerParameters_t schedulerParams;
};

[[vk::push_constant]]
ScanPushConstants spc;


everything affecting the pipeline layout should be userspace

Comment on lines -40 to +38
-#ifndef _NBL_HLSL_MAIN_DEFINED_
-[numthreads(_NBL_HLSL_WORKGROUP_SIZE_, 1, 1)]
-void CSMain()
+[numthreads(WORKGROUP_SIZE,1,1)]
+void main()
 {
-if (bool(nbl::hlsl::scan::getIndirectElementCount()))
-nbl::hlsl::scan::main();
+if(bool(nbl::hlsl::scan::getIndirectElementCount())) {
+// TODO call main from virtual_workgroup.hlsl
+}


all this should be userspace, also there will be very little difference between direct and indirect

namespace nbl
{
namespace hlsl
{
namespace scan
{
-template<class Binop, class Storage_t>
-void virtualWorkgroup(in uint treeLevel, in uint localWorkgroupIndex)
+template<class Binop, typename Storage_t, bool isExclusive, uint16_t ItemCount, class Accessor, class device_capabilities=void>


why not template on the workgroup scan instead?

Within you can alias and extract (see the sketch after this list):

  • binop
  • storage_t
  • exclusive or not
  • item count
  • smem accessor
  • device traits necessary
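A hypothetical sketch of that shape (member/alias names assumed, not from the PR):

// everything hangs off the single WorkgroupScan template parameter
template<class WorkgroupScan>
void virtualWorkgroup(const uint32_t treeLevel, const uint32_t localWorkgroupIndex)
{
    typedef typename WorkgroupScan::binop_t binop_t;
    typedef typename WorkgroupScan::storage_t storage_t;
    const bool isExclusive = WorkgroupScan::IsExclusive;
    const uint16_t itemCount = WorkgroupScan::ItemCount;
    // the shared-memory accessor and device_capabilities would likewise be aliases of WorkgroupScan
}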

Comment on lines 22 to 23
const uint32_t levelInvocationIndex = localWorkgroupIndex * glsl::gl_WorkGroupSize().x + SubgroupContiguousIndex();
const bool lastInvocationInGroup = SubgroupContiguousIndex() == (gl_WorkGroupSize().x - 1);


shouldn't ItemCount be used instead of glsl::gl_WorkGroupSize().x ?


most definitely
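i.e. a sketch of the fix, using the ItemCount template parameter already present on virtualWorkgroup:

const uint32_t levelInvocationIndex = localWorkgroupIndex*ItemCount + SubgroupContiguousIndex();
const bool lastInvocationInGroup = SubgroupContiguousIndex() == (ItemCount-1);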

Comment on lines 112 to 133
// push constants of the default direct scan pipeline provide both aux memory offset params and scheduling params
struct DefaultPushConstants
{
Parameters scanParams;
SchedulerParameters schedulerParams;
};

struct DispatchInfo
{
DispatchInfo() : wg_count(0u)
{
}

// in case we scan very few elements, you don't want to launch workgroups that wont do anything
DispatchInfo(const IPhysicalDevice::SLimits& limits, const uint32_t elementCount, const uint32_t workgroupSize)
{
constexpr auto workgroupSpinningProtection = 4u; // to prevent first workgroup starving/idling on level 1 after finishing level 0 early
wg_count = limits.computeOptimalPersistentWorkgroupDispatchSize(elementCount,workgroupSize,workgroupSpinningProtection);
}

uint32_t wg_count;
};


move everything to HLSL; you can also have a look at PR #643, where computeOptimalPersistentWorkgroupDispatchSize may move to device_capability_traits

Comment on lines +274 to +275
protected:
// REVIEW: Does it need an empty destructor?


no, a default one will be made

// REVIEW: Does it need an empty destructor?

core::smart_refctd_ptr<ILogicalDevice> m_device;
core::smart_refctd_ptr<IGPUDescriptorSetLayout> m_ds_layout;


you won't be using any descriptors, so no need for a layout

Comment on lines +164 to +179
static inline void updateDescriptorSet(ILogicalDevice* device, IGPUDescriptorSet* set, const asset::SBufferRange<IGPUBuffer>& input_range, const asset::SBufferRange<IGPUBuffer>& scratch_range)
{
IGPUDescriptorSet::SDescriptorInfo infos[2];
infos[0].desc = input_range.buffer;
infos[0].info.buffer = {input_range.offset,input_range.size};
infos[1].desc = scratch_range.buffer;
infos[1].info.buffer = {scratch_range.offset,scratch_range.size};

video::IGPUDescriptorSet::SWriteDescriptorSet writes[2];
for (auto i=0u; i<2u; i++)
{
writes[i] = { .dstSet = set,.binding = i,.arrayElement = 0u,.count = 1u,.info = infos+i };
}

device->updateDescriptorSets(2, writes, 0u, nullptr);
}


no more descriptor sets

Comment on lines +158 to +162
inline void buildParameters(const uint32_t elementCount, DefaultPushConstants& pushConstants, DispatchInfo& dispatchInfo)
{
pushConstants.schedulerParams = SchedulerParameters(pushConstants.scanParams,elementCount,m_workgroupSize);
dispatchInfo = DispatchInfo(m_device->getPhysicalDevice()->getLimits(),elementCount,m_workgroupSize);
}


move to HLSL

Comment on lines +141 to +147
const IGPUDescriptorSetLayout::SBinding bindings[2] = {
{ 0u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, video::IGPUShader::ESS_COMPUTE, 1u, nullptr }, // main buffer
{ 1u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, video::IGPUShader::ESS_COMPUTE, 1u, nullptr } // scratch
};

m_ds_layout = m_device->createDescriptorSetLayout(bindings);
assert(m_ds_layout && "CArithmeticOps Descriptor Set Layout was not created successfully");


no bindings

Comment on lines +201 to +217
inline core::smart_refctd_ptr<asset::ICPUShader> createBaseShader(const char* shaderFile, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const
{
auto system = m_device->getPhysicalDevice()->getSystem();
core::smart_refctd_ptr<const system::IFile> hlsl;
{
auto loadBuiltinData = [&](const std::string _path) -> core::smart_refctd_ptr<const nbl::system::IFile>
{
nbl::system::ISystem::future_t<core::smart_refctd_ptr<nbl::system::IFile>> future;
system->createFile(future, system::path(_path), core::bitflag(nbl::system::IFileBase::ECF_READ) | nbl::system::IFileBase::ECF_MAPPABLE);
if (future.wait())
return future.copy();
return nullptr;
};

// hlsl = loadBuiltinData("nbl/builtin/hlsl/scan/indirect_reduce.hlsl"); ??
hlsl = loadBuiltinData(shaderFile);
}


this is super convoluted; why is the user supposed to know the path to the file you'll load?

break;
}

return asset::CHLSLCompiler::createOverridenCopy(cpushader.get(), "#define WORKGROUP_SIZE %d\ntypedef %s Storage_t;\n#define BINOP nbl::hlsl::%s\n#define SCRATCH_SZ %d\n", m_workgroupSize, storageType, binop, scratchSz);


it would be better to build the shader source instead of overriding some random shader

Comment on lines +135 to +137
CArithmeticOps(core::smart_refctd_ptr<ILogicalDevice>&& device) : CArithmeticOps(std::move(device),core::roundDownToPoT(device->getPhysicalDevice()->getLimits().maxOptimallyResidentWorkgroupInvocations)) {}

CArithmeticOps(core::smart_refctd_ptr<ILogicalDevice>&& device, const uint32_t workgroupSize) : m_device(std::move(device)), m_workgroupSize(workgroupSize)


if you don't plan to make CArithmeticOps usable on its own, make the constructors protected

{

public:
CReduce(core::smart_refctd_ptr<ILogicalDevice>&& device) : CReduce(std::move(device), core::roundDownToPoT(device->getPhysicalDevice()->getLimits().maxOptimallyResidentWorkgroupInvocations)) {}


don't repeat the roundDownToPoT everywhere, you should rely on CArithmeticOps to set that up for you
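e.g. a sketch, delegating to the one-argument CArithmeticOps constructor shown above, which already applies roundDownToPoT:

CReduce(core::smart_refctd_ptr<ILogicalDevice>&& device) : CArithmeticOps(std::move(device)) {}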


#include "nbl/builtin/hlsl/scan/declarations.hlsl"
static_assert(NBL_BUILTIN_MAX_LEVELS & 0x1, "NBL_BUILTIN_MAX_LEVELS must be odd!");


if we don't have our own static_assert in HLSL, then stick this inside an #ifndef __HLSL_VERSION inside the HLSL header
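i.e. something along these lines:

#ifndef __HLSL_VERSION
static_assert(NBL_BUILTIN_MAX_LEVELS & 0x1, "NBL_BUILTIN_MAX_LEVELS must be odd!");
#endif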

@@ -143,270 +142,43 @@ prefix sum with subgroup sized workgroups at peak Bandwidth efficiency in about


update the comments about the algo between lines 112-141 because we can't spinwait, we need to outline the new algo properly

-EST_INCLUSIVE = _NBL_GLSL_SCAN_TYPE_INCLUSIVE_,
+EST_INCLUSIVE = 0u,
 // computes output[n] = Sum_{i<n}(input[i]), meaning first element is identity
-EST_EXCLUSIVE = _NBL_GLSL_SCAN_TYPE_EXCLUSIVE_,
+EST_EXCLUSIVE,


I'd template on a bool instead
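A hypothetical sketch; the HLSL hunks above already carry bool isExclusive template parameters, so the C++ side could mirror that:

// instead of branching on an E_SCAN_TYPE enum value at runtime
template<bool isExclusive>
core::smart_refctd_ptr<asset::ICPUShader> createShader(const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const;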

Comment on lines +50 to +54
core::smart_refctd_ptr<asset::ICPUShader> CReduce::createShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz) const
{
core::smart_refctd_ptr<asset::ICPUShader> base = createBaseShader("nbl/builtin/hlsl/scan/direct.hlsl", dataType, op, scratchSz);
return asset::CHLSLCompiler::createOverridenCopy(base.get(), "#define IS_EXCLUSIVE %s\n#define IS_SCAN %s\n", "false", "false");
}


  1. the shader would need to have a name default_direct instead of direct if it requires you to define something for it to work
  2. if possible it's better to build a source with ostringstream, without overridden copies, so you have this
#include "nbl/builtin/hlsl/scan/default_direct.hlsl"

void main()
{
   nbl::hlsl::scan::default_direct<$Type,$Binop,$dataType,$scratchSize>::__call();
}

where Type is some template param that helps you tell apart reduction, inclusive scan, and exclusive scan

Comment on lines +46 to +50
{
core::smart_refctd_ptr<asset::ICPUShader> base = createBaseShader("nbl/builtin/hlsl/scan/direct.hlsl", dataType, op, scratchSz);
return asset::CHLSLCompiler::createOverridenCopy(base.get(), "#define IS_EXCLUSIVE %s\n#define IS_SCAN %s\n", scanType == EST_EXCLUSIVE ? "true" : "false", "true");
}


same comment as for CReduce

Comment on lines +6 to -21
asset::ICPUShader* CScanner::getDefaultShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz)
{
if (!m_shaders[scanType][dataType][op])
m_shaders[scanType][dataType][op] = createShader(false,scanType,dataType,op,scratchSz);
return m_shaders[scanType][dataType][op].get();
}

#if 0 // TODO: port
core::smart_refctd_ptr<asset::ICPUShader> CScanner::createShader(const bool indirect, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) const
IGPUShader* CScanner::getDefaultSpecializedShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz)
{
auto system = m_device->getPhysicalDevice()->getSystem();
core::smart_refctd_ptr<const system::IFile> glsl;
{
auto loadBuiltinData = [&](const std::string _path) -> core::smart_refctd_ptr<const nbl::system::IFile>
{
nbl::system::ISystem::future_t<core::smart_refctd_ptr<nbl::system::IFile>> future;
system->createFile(future, system::path(_path), core::bitflag(nbl::system::IFileBase::ECF_READ) | nbl::system::IFileBase::ECF_MAPPABLE);
if (future.wait())
return future.copy();
return nullptr;
};
if (!m_specialized_shaders[scanType][dataType][op])
{
auto cpuShader = core::smart_refctd_ptr<asset::ICPUShader>(getDefaultShader(scanType,dataType,op,scratchSz));
cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/direct.hlsl");
cpuShader->setShaderStage(asset::IShader::ESS_COMPUTE);

auto gpushader = m_device->createShader(cpuShader.get());

m_specialized_shaders[scanType][dataType][op] = gpushader;
}
return m_specialized_shaders[scanType][dataType][op].get();
}

if(indirect)


these 3 utilities are practically identical for CScan and CReduce, why not hoist them to CArithmeticOps?
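A hypothetical sketch of the hoist (the EDT_COUNT/EO_COUNT array extents are assumed enum sizes):

// in CArithmeticOps: the caching is written once, CScan/CReduce only override createShader
asset::ICPUShader* getDefaultShader(const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz)
{
    if (!m_shaders[dataType][op])
        m_shaders[dataType][op] = createShader(dataType,op,scratchSz);
    return m_shaders[dataType][op].get();
}
virtual core::smart_refctd_ptr<asset::ICPUShader> createShader(const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const = 0;
core::smart_refctd_ptr<asset::ICPUShader> m_shaders[EDT_COUNT][EO_COUNT];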

Comment on lines +43 to +44
void computeParameters(NBL_CONST_REF_ARG(uint32_t) elementCount, out Parameters_t _scanParams, out DefaultSchedulerParameters_t _schedulerParams)
{


The parameters are "married" to the scheduling logic.

So make these methods of DefaultSchedulerParameters_t; this one is really just a constructor, so turn it into a static DefaultSchedulerParameters_t create(...) method.

Hint: whenever a function takes an inout or out struct T as its first parameter, it likely means that I wrote it in GLSL to be a pseudo-method (it would have a C fn ptr signature matching a method).

Bonus Hint: whenever the first param is an in T, it would have been a const method in C++.
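A hypothetical sketch of that rewrite (signature assumed from the computeParameters hunk above):

struct DefaultSchedulerParameters_t
{
    static DefaultSchedulerParameters_t create(NBL_CONST_REF_ARG(uint32_t) elementCount, NBL_REF_ARG(Parameters_t) _scanParams)
    {
        DefaultSchedulerParameters_t retval;
        // former computeParameters body goes here, filling retval instead of an out parameter
        return retval;
    }
    // existing finishedFlagOffset/cumulativeWorkgroupCount members stay as they are
};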

Comment on lines +21 to +23
* The CScanner.h parameter computation calculates the number of virtual workgroups that will have to be launched for the Scan operation
* (always based on the elementCount) as well as different offsets for the results of each step of the Scan operation, flag positions
* that are used for synchronization etc.


The logic needs to change to "last one out closes the door"

The comments too.

we will now only launch RoundUp(N/ItemsPerWorkgroup) virtual workgroups, not the geometric series
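For reference, that count in code (a sketch; elementCount and ItemsPerWorkgroup named as elsewhere in this PR):

const uint32_t virtualWorkgroupCount = (elementCount+ItemsPerWorkgroup-1u)/ItemsPerWorkgroup; // RoundUp(N/ItemsPerWorkgroup)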

Comment on lines +30 to +31
* Keep in mind that each virtual workgroup executes a single step of the whole algorithm, which is why we have the cumulativeWorkgroupCount.
* The first virtual workgroups will work on the upsweep phase, the next on the downsweep phase.


Since we do "last workgroup takes over" we will now only launch RoundUp(N/ItemsPerWorkgroup) virtual workgroups, not the geometric series

So no need for the cumulativeWorkgroupCount.

Comment on lines +40 to +41
* |> finishedFlagOffset - an index in the scratch buffer where each virtual workgroup indicates that ALL its invocations have finished their work. This helps
* synchronizing between workgroups with while-loop spinning.


amend comments as needed

Comment on lines +33 to +35
uint32_t cumulativeWorkgroupCount[NBL_BUILTIN_MAX_LEVELS];
uint32_t workgroupFinishFlagsOffset[NBL_BUILTIN_MAX_LEVELS];
uint32_t lastWorkgroupSetCountForLevel[NBL_BUILTIN_MAX_LEVELS];


not sure you need either cumulativeWorkgroupCount or lastWorkgroupSetCountForLevel for a "last one out closes the door" upsweep

default:
break;
#if NBL_BUILTIN_MAX_SCAN_LEVELS>7
const uint32_t workgroupSizeLog2 = firstbithigh(glsl::gl_WorkGroupSize().x);


modify it to not use Vulkan/SPIR-V environment builtins, that way we can use it from C++ too!

(So take workgroup size - or more accurately item per workgroup count - from the outside)
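A hypothetical sketch of what that could look like (helper name assumed):

// the caller supplies the log2 of the items-per-workgroup count, so nothing here
// touches glsl::gl_WorkGroupSize() and the header compiles as C++ too
uint32_t lastWorkgroupOfLevel(const uint32_t lastElement, const uint32_t itemsPerWorkgroupLog2)
{
    return lastElement>>itemsPerWorkgroupLog2;
}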

#endif
// REVIEW: Putting topLevel second allows better alignment for packing of constant variables, assuming lastElement has length 4. (https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-packing-rules)
struct Parameters_t {
uint32_t lastElement[NBL_BUILTIN_MAX_LEVELS/2+1];


add documentation about what lastElement is used for

Comment on lines +64 to +85
_schedulerParams.finishedFlagOffset[1] = 1u;

_scanParams.temporaryStorageOffset[0] = 2u;
break;
case 2u:
_schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1);
_schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+1u;
_schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+WorkgroupCount(1);
_schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(0);
// climb up
_schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1);
_schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+1u;
// climb down
_schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[1]+2u;

_scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[3]+WorkgroupCount(1);
_scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0);
break;
case 3u:
_schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1);
_schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(2);
_schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+1u;
Member

because due to forward progress guarantees you can only do upsweep or downsweep in a single dispatch, you don't need the "climb down"

you probably don't even need the cumulative workgroup counts, as the way you'd find out if your current workgroup truly is the last one, is by checking the return value of the finished flag atomic counter.

e.g. markFinished

uint32_t howManyDone = flagsAccessor.atomicAdd(finishedOffset[level]+virtualWorkgroupIndexInThisLevel,1u);
//uint32_t currentLastWorkgroup = currentLastElement>>ItemsPerWorkgroupLog2;
uint32_t nextLevelLastWorkgroup = currentLastWorkgroup >>ItemsPerWorkgroupLog2;
uint32_t virtualWorkgroupIndexInNextLevel = virtualWorkgroupIndexInThisLevel>>ItemsPerWorkgroupLog2;
// NOT: either last workgroup, or in the last bundle of workgroups and last workgroup dynamically there
if (howManyDone!=(ItemsPerWorkgroup-1) && (virtualWorkgroupIndexInNextLevel!=nextLevelLastWorkgroup || (virtualWorkgroupIndexInThisLevel&(ItemsPerWorkgroup-1))!=howManyDone) )
    return;
// last one takes over
currentLastWorkgroup = nextLevelLastWorkgroup ;


btw I'd try to rewrite this as a loop instead of a weird switch that was only written like this to avoid recursion in GLSL
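A hypothetical sketch of the loop form (cumulative counts kept just to show the shape; per the comments above they may go away entirely):

_schedulerParams.cumulativeWorkgroupCount[0] = WorkgroupCount(0);
for (uint32_t level=1u; level<=topLevel; level++)
    _schedulerParams.cumulativeWorkgroupCount[level] = _schedulerParams.cumulativeWorkgroupCount[level-1u]+WorkgroupCount(level);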

Comment on lines +110 to +117
/**
* treeLevel - the current level in the Blelloch Scan
* localWorkgroupIndex - the workgroup index the current invocation is a part of in the specific virtual dispatch.
* For example, if we have dispatched 10 workgroups and the virtual workgroup number is 35, then the localWorkgroupIndex should be 5.
*/
template<class Accessor>
bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_REF_ARG(uint32_t) treeLevel, NBL_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) sharedScratch)
{


you can't do any flagging via shared memory because you want inter-workgroup communication, it must be a BDA address!


even better, a bda::__ptr<uint32_t>, 'cause then you'll have atomicAdd methods

Comment on lines +129 to +164
sharedScratch.workgroupExecutionAndMemoryBarrier();
const uint32_t globalWorkgroupIndex = sharedScratch.get(0u);

treeLevel = sharedScratch.get(1u);
if(treeLevel>lastLevel)
return true;

localWorkgroupIndex = globalWorkgroupIndex;
const bool dependentLevel = treeLevel != 0u;
if(dependentLevel)
{
const uint32_t prevLevel = treeLevel - 1u;
localWorkgroupIndex -= params.cumulativeWorkgroupCount[prevLevel];
if(workgroup::SubgroupContiguousIndex() == 0u)
{
uint32_t dependentsCount = 1u;
if(treeLevel <= topLevel)
{
dependentsCount = glsl::gl_WorkGroupSize().x;
const bool lastWorkgroup = (globalWorkgroupIndex+1u)==params.cumulativeWorkgroupCount[treeLevel];
if (lastWorkgroup)
{
const Parameters_t scanParams = getParameters();
dependentsCount = scanParams.lastElement[treeLevel]+1u;
if (treeLevel<topLevel)
{
dependentsCount -= scanParams.lastElement[treeLevel+1u] * glsl::gl_WorkGroupSize().x;
}
}
}
uint32_t dependentsFinishedFlagOffset = localWorkgroupIndex;
if (treeLevel>topLevel) // !(prevLevel<topLevel) TODO: merge with `else` above?
dependentsFinishedFlagOffset /= glsl::gl_WorkGroupSize().x;
dependentsFinishedFlagOffset += params.finishedFlagOffset[prevLevel];
while (scanScratchBuf[0].data[dependentsFinishedFlagOffset]!=dependentsCount)
glsl::memoryBarrierBuffer();


I expect a rewrite of this into "last one out closes the door"

Comment on lines +178 to +187
if (treeLevel<topLevel)
{
finishedFlagOffset += localWorkgroupIndex/glsl::gl_WorkGroupSize().x;
glsl::atomicAdd(scanScratchBuf[0].data[finishedFlagOffset], 1u);
}
else if (treeLevel!=(topLevel<<1u))
{
finishedFlagOffset += localWorkgroupIndex;
scanScratchBuf[0].data[finishedFlagOffset] = 1u;
}


comments about what and why is happening in the if statements

Comment on lines 32 to 33
template<class Accessor>
bool getWork(NBL_REF_ARG(Accessor) accessor)


what's the accessor for? It needs a better name.

Also, why don't you store the accessor past the create as a member?
